Commit 7ceeed3e by James Cropcho

Merge pull request #72 from todvora/unnamed-objects-counts

#69 - fixed wrong percentage indications in unnamed objects inside arrays
parents 420588e9 a24d38c8
...@@ -23,7 +23,7 @@ class SampleData { ...@@ -23,7 +23,7 @@ class SampleData {
"| name | String | 5 | 100 |\n" + "| name | String | 5 | 100 |\n" +
"| bio | String | 3 | 60 |\n" + "| bio | String | 3 | 60 |\n" +
"| birthday | String | 2 | 40 |\n" + "| birthday | String | 2 | 40 |\n" +
"| pets | String,Array | 2 | 40 |\n" + "| pets | Array,String | 2 | 40 |\n" +
"| someBinData | BinData-old | 1 | 20 |\n" + "| someBinData | BinData-old | 1 | 20 |\n" +
"| someWeirdLegacyKey | String | 1 | 20 |\n" + "| someWeirdLegacyKey | String | 1 | 20 |\n" +
"+------------------------------------------------------------+"; "+------------------------------------------------------------+";
......
...@@ -9,6 +9,8 @@ import org.junit.Assert; ...@@ -9,6 +9,8 @@ import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import java.util.Arrays;
/** /**
* Test, how variety handles objects, that are not named (for example objects inside array). * Test, how variety handles objects, that are not named (for example objects inside array).
* It addresses behavior described in issue https://github.com/variety/variety/issues/29 * It addresses behavior described in issue https://github.com/variety/variety/issues/29
...@@ -20,8 +22,14 @@ public class UnnamedObjectsAnalysisTest { ...@@ -20,8 +22,14 @@ public class UnnamedObjectsAnalysisTest {
@Before @Before
public void setUp() throws Exception { public void setUp() throws Exception {
this.variety = new Variety("test", "users"); this.variety = new Variety("test", "users");
variety.getSourceCollection().insert((DBObject) JSON.parse("{title:'Article 1', comments:[{author:'John', body:'it works', visible:true }]}")); variety.getSourceCollection().insert(Arrays.asList(
variety.getSourceCollection().insert((DBObject) JSON.parse("{title:'Article 2', comments:[{author:'Tom', body:'thanks'}]}")); createDbObj("{title:'Article 1', comments:[{author:'John', body:'it works', visible:true }]}"),
createDbObj("{title:'Article 2', comments:[{author:'Tom', body:'thanks'}, {author:'Mark', body:1}]}")
));
}
private DBObject createDbObj(final String json) {
return (DBObject) JSON.parse(json);
} }
@After @After
...@@ -42,7 +50,7 @@ public class UnnamedObjectsAnalysisTest { ...@@ -42,7 +50,7 @@ public class UnnamedObjectsAnalysisTest {
// unnamed objects are prefixed with .XX key // unnamed objects are prefixed with .XX key
analysis.validate("comments.XX.author", 2, 100, "String"); analysis.validate("comments.XX.author", 2, 100, "String");
analysis.validate("comments.XX.body", 2, 100, "String"); analysis.validate("comments.XX.body", 2, 100, "String", "Number");
analysis.validate("comments.XX.visible", 1, 50, "Boolean"); analysis.validate("comments.XX.visible", 1, 50, "Boolean");
} }
} }
...@@ -175,44 +175,25 @@ var serializeDoc = function(doc, maxDepth) { ...@@ -175,44 +175,25 @@ var serializeDoc = function(doc, maxDepth) {
return result; return result;
}; };
var interimResults = {}; //hold results here until converted to final format var mergeArrays = function(a, b) {
// main cursor if(typeof a === 'undefined') {a = [];}
var numDocuments = 0; return a.concat(b) // merge two arrays into one, including duplications
db[collection].find($query).sort($sort).limit($limit).forEach(function(obj) { .filter(function(item, pos, self){return self.indexOf(item) == pos;}) // remove duplications
//printjson(obj) .sort(); // sort alphabetically
var flattened = serializeDoc(obj, $maxDepth); };
//printjson(flattened)
for (var key in flattened){ // convert document to key-value map, where value is always an array with types as plain strings
var value = flattened[key]; var analyseDocument = function(document) {
var result = {};
for (var key in document) {
var value = document[key];
//translate unnamed object key from {_parent_name_}.{_index_} to {_parent_name_}.XX //translate unnamed object key from {_parent_name_}.{_index_} to {_parent_name_}.XX
key = key.replace(/\.\d+/g,'.XX'); key = key.replace(/\.\d+/g,'.XX');
result[key] = mergeArrays(result[key], varietyTypeOf(value));
var valueType = varietyTypeOf(value);
if(!(key in interimResults)){ //if it's a new key we haven't seen yet
interimResults[key] = {'types':[valueType],'totalOccurrences':1};
}
else{ //we've seen this key before
if(interimResults[key]['types'].indexOf(valueType) == -1) {
interimResults[key]['types'].push(valueType);
}
interimResults[key]['totalOccurrences']++;
}
} }
numDocuments++; return result;
}); };
var varietyResults = [];
//now convert the interimResults into the proper format
for(var key in interimResults){
var entry = interimResults[key];
var newEntry = {};
newEntry['_id'] = {'key':key};
newEntry['value'] = {'types':entry['types']};
newEntry['totalOccurrences'] = entry['totalOccurrences'];
newEntry['percentContaining'] = entry['totalOccurrences']*100/$limit;
varietyResults.push(newEntry);
}
// We throw away keys which end in an array index, since they are not useful // We throw away keys which end in an array index, since they are not useful
// for our analysis. (We still keep the key of their parent array, though.) -JC // for our analysis. (We still keep the key of their parent array, though.) -JC
...@@ -220,30 +201,44 @@ var filter = function(item) { ...@@ -220,30 +201,44 @@ var filter = function(item) {
return !item._id.key.match(/\.XX$/); return !item._id.key.match(/\.XX$/);
}; };
// Post-processes a single result entry: optionally overwrites its occurrence
// count and then derives percentContaining from it. The entry is mutated in
// place and the same object is returned.
var map = function(item) {
  var rawKey = item._id.key;
  // An $exists query matches values embedded in arrays directly,
  // i.e. {arr:[{x:1}]} is matched by {'arr.x':{$exists:true}},
  // so the '.XX' placeholders only need to be dropped from the key.
  // NOTE(review): the stripped key is computed but never read afterwards in this function.
  var strippedKey = rawKey.match(/\.XX/) ? rawKey.replace(/.XX/g,'') : rawKey;

  // When no effective limit is in play the count is already correct
  // (it was set earlier), so it is only replaced when the limit is
  // smaller than the number of documents.
  var limitIsEffective = $limit < numDocuments;
  if (limitIsEffective) {
    item.totalOccurrences = db[collection].count($query);
  }

  var share = item.totalOccurrences / numDocuments;
  item.percentContaining = share * 100.0;
  return item;
};
// sort desc by totalOccurrences or by key asc if occurrences equal // sort desc by totalOccurrences or by key asc if occurrences equal
var comparator = function(a, b) { var comparator = function(a, b) {
var countsDiff = b.totalOccurrences - a.totalOccurrences; var countsDiff = b.totalOccurrences - a.totalOccurrences;
return countsDiff !== 0 ? countsDiff : a._id.key.localeCompare(b._id.key); return countsDiff !== 0 ? countsDiff : a._id.key.localeCompare(b._id.key);
}; };
log('removing leaf arrays in results collection, and getting percentages'); var reduceDocuments = function(accumulator, docResult, index, array) {
varietyResults = varietyResults.filter(filter).map(map).sort(comparator); var duplicityCheck = function(item){return item.key === key;};
for (var key in docResult) {
var known = accumulator.filter(duplicityCheck);
if(known.length > 0) {
var existing = known[0];
existing.types = mergeArrays(docResult[key], existing.types);
existing.totalOccurrences = existing.totalOccurrences + 1;
} else {
accumulator.push({'key':key, 'types':docResult[key], 'totalOccurrences':1});
}
}
return accumulator;
};
// Reshapes one accumulated entry ({key, types, totalOccurrences}) into the
// result-row structure and adds percentContaining, computed against $limit.
var computePercentages = function(entry){
  var occurrences = entry.totalOccurrences;
  var row = {};
  row._id = {'key': entry.key};
  row.value = {'types': entry.types};
  row.totalOccurrences = occurrences;
  row.percentContaining = occurrences*100/$limit;
  return row;
};
// Main processing pipeline, written as named stages. The stages are wrapped
// in an immediately-invoked function so the intermediate values do not
// become additional script-level variables.
var varietyResults = (function() {
  // query MongoDB with the configured filter, ordering and limit
  var cursor = db[collection].find($query).sort($sort).limit($limit);
  // flatten every document into compound keys
  var flattenedDocs = cursor.map(function(doc) {return serializeDoc(doc, $maxDepth);});
  // per document: collect the types seen for each key
  var perDocument = flattenedDocs.map(analyseDocument);
  // merge keys and types across all documents
  var merged = perDocument.reduce(reduceDocuments, []);
  // reformat to the expected result structure and add percentages
  var formatted = merged.map(computePercentages);
  // discard keys that end in an array index
  var kept = formatted.filter(filter);
  // order by occurrences, then alphabetically by key
  return kept.sort(comparator);
}());
if($persistResults) { if($persistResults) {
var resultsDB = db.getMongo().getDB('varietyResults'); var resultsDB = db.getMongo().getDB('varietyResults');
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment