Merge pull request #72 from todvora/unnamed-objects-counts

#69 - fixed wrong percentage indications in unnamed objects inside arrays

Merge pull request #72 from todvora/unnamed-objects-counts
#69 - fixed wrong percentage indications in unnamed objects inside arrays
7ceeed3e · James Cropcho · 420588e9 · a24d38c8 · 7ceeed3e · 7ceeed3e
Commit 7ceeed3e authored Feb 25, 2015 by James Cropcho
Showing with 59 additions and 56 deletions

SampleData.java test/src/test/java/com/github/variety/test/SampleData.java +1 -1

UnnamedObjectsAnalysisTest.java ...a/com/github/variety/test/UnnamedObjectsAnalysisTest.java +11 -3

variety.js variety.js +47 -52

No files found.
--- a/test/src/test/java/com/github/variety/test/SampleData.java
+++ b/test/src/test/java/com/github/variety/test/SampleData.java
@@ -23,7 +23,7 @@ class SampleData {
            "| name               | String       | 5           | 100      |\n" +
            "| bio                | String       | 3           | 60       |\n" +
            "| birthday           | String       | 2           | 40       |\n" +
-            "| pets               | String,Array | 2           | 40       |\n" +
+            "| pets               | Array,String | 2           | 40       |\n" +
            "| someBinData        | BinData-old  | 1           | 20       |\n" +
            "| someWeirdLegacyKey | String       | 1           | 20       |\n" +
            "+------------------------------------------------------------+";

--- a/test/src/test/java/com/github/variety/test/UnnamedObjectsAnalysisTest.java
+++ b/test/src/test/java/com/github/variety/test/UnnamedObjectsAnalysisTest.java
@@ -9,6 +9,8 @@ import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;

+import java.util.Arrays;
+
 /**
 * Test, how variety handles objects, that are not named (for example objects inside array).
 * It addresses behavior described in issue https://github.com/variety/variety/issues/29
@@ -20,8 +22,14 @@ public class UnnamedObjectsAnalysisTest {
    @Before
    public void setUp() throws Exception {
        this.variety = new Variety("test", "users");
-        variety.getSourceCollection().insert((DBObject) JSON.parse("{title:'Article 1', comments:[{author:'John', body:'it works', visible:true }]}"));
-        variety.getSourceCollection().insert((DBObject) JSON.parse("{title:'Article 2', comments:[{author:'Tom', body:'thanks'}]}"));
+        variety.getSourceCollection().insert(Arrays.asList(
+                createDbObj("{title:'Article 1', comments:[{author:'John', body:'it works', visible:true }]}"),
+                createDbObj("{title:'Article 2', comments:[{author:'Tom', body:'thanks'}, {author:'Mark', body:1}]}")
+        ));
+    }
+
+    private DBObject createDbObj(final String json) {
+        return (DBObject) JSON.parse(json);
    }

    @After
@@ -42,7 +50,7 @@ public class UnnamedObjectsAnalysisTest {

        // unnamed objects are prefixed with .XX key
        analysis.validate("comments.XX.author", 2, 100, "String");
-        analysis.validate("comments.XX.body", 2, 100, "String");
+        analysis.validate("comments.XX.body", 2, 100, "String", "Number");
        analysis.validate("comments.XX.visible", 1, 50, "Boolean");
    }
 }
--- a/variety.js
+++ b/variety.js
@@ -175,44 +175,25 @@ var serializeDoc = function(doc, maxDepth) {
  return result;
 };

-var interimResults = {}; //hold results here until converted to final format
-// main cursor
-var numDocuments = 0;
-db[collection].find($query).sort($sort).limit($limit).forEach(function(obj) {
-  //printjson(obj)
-  var flattened = serializeDoc(obj, $maxDepth);
-  //printjson(flattened)
-  for (var key in flattened){
-    var value = flattened[key];
+var mergeArrays = function(a, b) {
+  if(typeof a === 'undefined') {a = [];}
+  return a.concat(b) // merge two arrays into one, including duplicates
+        .filter(function(item, pos, self){return self.indexOf(item) == pos;}) // remove duplicates
+        .sort(); // sort alphabetically
+};
+
+// convert document to key-value map, where value is always an array with types as plain strings
+var analyseDocument = function(document) {
+  var result = {};
+  for (var key in document) {
+    var value = document[key];

    //translate unnamed object key from {_parent_name_}.{_index_} to {_parent_name_}.XX
    key = key.replace(/\.\d+/g,'.XX');
-
-    var valueType = varietyTypeOf(value);
-    if(!(key in interimResults)){ //if it's a new key we haven't seen yet
-      interimResults[key] = {'types':[valueType],'totalOccurrences':1};
-    }
-    else{ //we've seen this key before
-      if(interimResults[key]['types'].indexOf(valueType) == -1) {
-        interimResults[key]['types'].push(valueType);
-      }
-      interimResults[key]['totalOccurrences']++;
-    }
+    result[key] = mergeArrays(result[key], varietyTypeOf(value));
  }
-    numDocuments++;
-});
-
-var varietyResults = [];
-//now convert the interimResults into the proper format
-for(var key in interimResults){
-  var entry = interimResults[key];
-  var newEntry = {};
-  newEntry['_id'] = {'key':key};
-  newEntry['value'] = {'types':entry['types']};
-  newEntry['totalOccurrences'] = entry['totalOccurrences'];
-  newEntry['percentContaining'] = entry['totalOccurrences']*100/$limit;
-  varietyResults.push(newEntry);
-}
+  return result;
+};

 // We throw away keys which end in an array index, since they are not useful
 // for our analysis. (We still keep the key of their parent array, though.) -JC
@@ -220,30 +201,44 @@ var filter = function(item) {
  return !item._id.key.match(/\.XX$/);
 };

-var map = function(item) {
-  var keyName = item._id.key;
-  if(keyName.match(/\.XX/)) {
-    // exists query checks for embedded values for an array
-    // ie. match {arr:[{x:1}]} with {'arr.x':{$exists:true}}
-    // just need to pull out .XX in this case
-    keyName = keyName.replace(/.XX/g,'');
-  }
-  // we don't need to set it if limit isn't being used. (it's set above.)
-  if($limit < numDocuments) {
-      item.totalOccurrences = db[collection].count($query);
-  }
-  item.percentContaining = (item.totalOccurrences / numDocuments) * 100.0;
-  return item;
-};
-
 // sort desc by totalOccurrences or by key asc if occurrences equal
 var comparator = function(a, b) {
  var countsDiff = b.totalOccurrences - a.totalOccurrences;
  return countsDiff !== 0 ? countsDiff : a._id.key.localeCompare(b._id.key);
 };

-log('removing leaf arrays in results collection, and getting percentages');
-varietyResults = varietyResults.filter(filter).map(map).sort(comparator);
+var reduceDocuments = function(accumulator, docResult, index, array) {
+  var duplicityCheck = function(item){return item.key === key;};
+  for (var key in docResult) {
+    var known = accumulator.filter(duplicityCheck);
+    if(known.length > 0) {
+      var existing = known[0];
+      existing.types = mergeArrays(docResult[key], existing.types);
+      existing.totalOccurrences = existing.totalOccurrences + 1;
+    } else {
+      accumulator.push({'key':key, 'types':docResult[key], 'totalOccurrences':1});
+    }
+  }
+  return accumulator;
+};
+
+var computePercentages = function(entry){
+  return {
+    '_id':{'key':entry.key},
+    'value': {'types':entry.types},
+    'totalOccurrences': entry.totalOccurrences,
+    'percentContaining': entry.totalOccurrences*100/$limit
+  };
+};
+
+// the main processing pipe
+var varietyResults = db[collection].find($query).sort($sort).limit($limit) // read data from the mongodb
+  .map(function(obj) {return serializeDoc(obj, $maxDepth);}) // flatten structure, create compound keys
+  .map(analyseDocument) // analyse keys and types of document, removing duplicate types
+  .reduce(reduceDocuments, []) // merge all keys and types
+  .map(computePercentages) // add percentages, reformat results to expected structure
+  .filter(filter) // throw away keys which end in an array index
+  .sort(comparator); // sort by occurrences and alphabet

 if($persistResults) {
  var resultsDB = db.getMongo().getDB('varietyResults');