......@@ -23,7 +23,7 @@ class SampleData {
"| name | String | 5 | 100 |\n" +
"| bio | String | 3 | 60 |\n" +
"| birthday | String | 2 | 40 |\n" +
"| pets | String,Array | 2 | 40 |\n" +
"| pets | Array,String | 2 | 40 |\n" +
"| someBinData | BinData-old | 1 | 20 |\n" +
"| someWeirdLegacyKey | String | 1 | 20 |\n" +
......@@ -9,6 +9,8 @@ import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.util.Arrays;
* Tests how Variety handles objects that are not named (for example, objects inside an array).
* It addresses behavior described in issue
......@@ -20,8 +22,14 @@ public class UnnamedObjectsAnalysisTest {
public void setUp() throws Exception {
this.variety = new Variety("test", "users");
variety.getSourceCollection().insert((DBObject) JSON.parse("{title:'Article 1', comments:[{author:'John', body:'it works', visible:true }]}"));
variety.getSourceCollection().insert((DBObject) JSON.parse("{title:'Article 2', comments:[{author:'Tom', body:'thanks'}]}"));
createDbObj("{title:'Article 1', comments:[{author:'John', body:'it works', visible:true }]}"),
createDbObj("{title:'Article 2', comments:[{author:'Tom', body:'thanks'}, {author:'Mark', body:1}]}")
/**
 * Parses the given JSON text with {@code JSON.parse} and returns the result cast to {@code DBObject}.
 * Used by the test set-up to build the documents that are inserted into the source collection.
 *
 * @param json JSON text of the document to create
 * @return the parsed value, cast to {@code DBObject}
 */
private DBObject createDbObj(final String json) {
return (DBObject) JSON.parse(json);
......@@ -42,7 +50,7 @@ public class UnnamedObjectsAnalysisTest {
// unnamed objects are prefixed with .XX key
analysis.validate("", 2, 100, "String");
analysis.validate("comments.XX.body", 2, 100, "String");
analysis.validate("comments.XX.body", 2, 100, "String", "Number");
analysis.validate("comments.XX.visible", 1, 50, "Boolean");
......@@ -175,31 +175,40 @@ var serializeDoc = function(doc, maxDepth) {
return result;
var interimResults = {}; //hold results here until converted to final format
// main cursor
var numDocuments = 0;
db[collection].find($query).sort($sort).limit($limit).forEach(function(obj) {
var flattened = serializeDoc(obj, $maxDepth);
for (var key in flattened){
var value = flattened[key];
// Returns a new array holding the union of `a` and `b`: duplicates removed, then sorted.
// `a` may be undefined, in which case it is treated as an empty array.
// `b` is passed straight to Array.prototype.concat, so it may be an array or a single value.
// The caller's arrays are not modified: concat() and filter() both return new arrays.
var mergeArrays = function(a, b) {
if(typeof a === 'undefined') {a = [];}
return a.concat(b) // merge two arrays into one, including duplications
.filter(function(item, pos, self){return self.indexOf(item) == pos;}) // remove duplications: keep only the first occurrence of each item
.sort(); // sort alphabetically (default sort(), i.e. items are compared as strings)
// Converts a (flattened) document to a key-value map, where each value is always an array
// of the types seen for that key within this one document. The types are whatever
// varietyTypeOf returns (defined elsewhere; per the original comment, plain strings).
// Because numeric path segments are collapsed to .XX, all elements of an array contribute
// to the same key, and mergeArrays keeps the resulting type list sorted and duplicate-free.
var analyseDocument = function(document) {
var result = {};
for (var key in document) {
var value = document[key];
//translate unnamed object key from {_parent_name_}.{_index_} to {_parent_name_}.XX
// NOTE(review): the pattern matches a '.' followed by digits anywhere in the key, so a
// segment that merely starts with digits (e.g. 'a.1b') is rewritten too ('a.XXb') - confirm this is intended.
key = key.replace(/\.\d+/g,'.XX');
// result[key] is undefined the first time a key is seen; mergeArrays treats that as an empty array
result[key] = mergeArrays(result[key], varietyTypeOf(value));
return result;
var valueType = varietyTypeOf(value);
if(!(key in interimResults)){ //if it's a new key we haven't seen yet
interimResults[key] = {'types':[valueType],'totalOccurrences':1};
else{ //we've seen this key before
if(interimResults[key]['types'].indexOf(valueType) == -1) {
var interimResults = {}; //hold results here until converted to final format
var numDocuments = 0;
// main cursor
// Walk every document selected by $query / $sort / $limit and fold its per-document
// key -> types map into interimResults.
db[collection].find($query).sort($sort).limit($limit).forEach(function(obj) {
var docResult = analyseDocument(serializeDoc(obj, $maxDepth));
for (var key in docResult) {
if(key in interimResults) {
// key already seen in an earlier document: union the type lists and add one occurrence.
// docResult has one entry per key, so a document is counted once per key,
// however many array elements carried that key.
var existing = interimResults[key];
interimResults[key] = {'types':mergeArrays(docResult[key], existing.types),'totalOccurrences':existing.totalOccurrences + 1};
} else {
// first document that contains this key
interimResults[key] = {'types':docResult[key],'totalOccurrences':1};
var varietyResults = [];
......@@ -221,13 +230,6 @@ var filter = function(item) {
var map = function(item) {
var keyName = item._id.key;
if(keyName.match(/\.XX/)) {
// exists query checks for embedded values for an array
// ie. match {arr:[{x:1}]} with {'arr.x':{$exists:true}}
// just need to pull out .XX in this case
keyName = keyName.replace(/.XX/g,'');
// we don't need to set it if limit isn't being used. (it's set above.)
if($limit < numDocuments) {
item.totalOccurrences = db[collection].count($query);
