Commit 1927a515 by Jacob Heller

refactored the code substantially to fix the regression from 'types' to 'type'…

Refactored the code substantially to fix the regression from 'types' to 'type' in the results. Added some comments, and the code is a little shorter too.
parent 3c0521d1
...@@ -63,17 +63,7 @@ print("Using maxDepth of " + maxDepth); ...@@ -63,17 +63,7 @@ print("Using maxDepth of " + maxDepth);
if (typeof sort === "undefined") { var sort = {_id: -1}; } if (typeof sort === "undefined") { var sort = {_id: -1}; }
print("Using sort of " + tojson(sort)); print("Using sort of " + tojson(sort));
/**
 * Reports whether a value is a container (plain object or array) whose
 * fields or elements can be descended into.
 *
 * Date, ObjectId and BinData instances are objects too, but they are treated
 * as leaf values and are never descended into.
 *
 * @param {*} v - any value read from a document
 * @returns {boolean} true for arrays and non-special objects, false otherwise
 */
varietyCanHaveChildren = function (v) {
  // typeof null === 'object', so null has to be excluded explicitly;
  // otherwise it would be reported as a container.
  if (v === null || typeof v !== 'object') {
    return false;
  }
  var specialObject = v instanceof Date ||
                      v instanceof ObjectId ||
                      v instanceof BinData;
  // Arrays are objects as well, so no separate array test is required.
  return !specialObject;
};
varietyTypeOf = function(thing) { varietyTypeOf = function(thing) {
if (typeof thing === "undefined") { throw "varietyTypeOf() requires an argument"; } if (typeof thing === "undefined") { throw "varietyTypeOf() requires an argument"; }
...@@ -112,96 +102,76 @@ varietyTypeOf = function(thing) { ...@@ -112,96 +102,76 @@ varietyTypeOf = function(thing) {
} }
}; };
var addTypeToArray = function(arr, value) { //flattens object keys to 1D. i.e. {'key1':1,{'key2':{'key3':2}}} becomes {'key1':1,'key2.key3':2}
var t = varietyTypeOf(value); //we assume no '.' characters in the keys, which is an OK assumption for MongoDB
var found = false; function serializeDoc(doc, maxDepth){
for(var i=0; i< arr.length; i++) { var result = {};
if(arr[i] === t) {
found = true; //determining if an object is a Hash vs Array vs something else is hard
break; function isHash(v) {
} var isArray = Array.isArray(v);
} var isObject = typeof v === 'object';
if(!found) { var specialObject = v instanceof Date ||
arr.push(t); v instanceof ObjectId ||
} v instanceof BinData;
}; return !specialObject && !isArray && isObject;
};
var addRecordResult = function(key, value, result) {
cur = result[key]; function serialize(document, parentKey, maxDepth){
if(!cur) { for(var key in document){
result[key] = {"_id":{"key":key},"value": {"type": varietyTypeOf(value)}, totalOccurrences:1}; //skip over inherited properties such as string, length, etch
} else { if(!(document.hasOwnProperty(key)))
var type = varietyTypeOf(value); continue
if(cur.value.type !== type) { var value = document[key];
cur.value.types = [cur.value.type]; //objects are skipped here and recursed into later
delete cur.value.type; //if(typeof value != "object")
addTypeToArray(cur.value.types, type); result[parentKey+key] = value;
} else if(!cur.value.type) { //it's an object, recurse...only if we haven't reached max depth
addTypeToArray(cur.value.types, type); if(isHash(value) && (maxDepth > 0)){
} serialize(value, parentKey+key+".",maxDepth-1);
result[key] = cur; }
} }
}; }
serialize(doc, "", maxDepth)
var mapRecursive = function(parentKey, obj, level, result) { return result
for (var key in obj) { }
if(obj.hasOwnProperty(key)) {
var value = obj[key];
key = (parentKey + "." + key).replace(/\.\d+/g,'.XX');
addRecordResult(key, value, result);
if (level < maxDepth - 1 && varietyCanHaveChildren(value)) {
mapRecursive(key, value, level + 1, result);
}
}
}
};
// store results here (no map reduce limit!)
var varietyResults = {};

/**
 * Merges the per-document records in `result` into `varietyResults`.
 *
 * Each record has the shape
 *   {_id: {key: ...}, value: {type: ...} | {types: [...]}, totalOccurrences: n}
 * A key seen for the first time is stored as-is. For a key that is already
 * present, the incoming type names are merged into the stored ones (without
 * duplicates) and the stored totalOccurrences is incremented by one.
 * A record keeps the single `type` field while only one type has been seen
 * and switches to a `types` array once a second, different type shows up.
 *
 * @param {Object} result - records of one document, keyed by field name
 */
var addVarietyResults = function(result) {
  var hasOwn = Object.prototype.hasOwnProperty;

  // The type names live on record.value, either as `type` or as `types`.
  var typesOf = function(recordValue) {
    return recordValue.types ? recordValue.types : [recordValue.type];
  };

  for (var key in result) {
    if (!hasOwn.call(result, key)) { continue; }
    var incoming = result[key];

    // Own-property test: a field named e.g. "constructor" must not be
    // mistaken for an existing entry inherited from Object.prototype.
    if (!hasOwn.call(varietyResults, key)) {
      varietyResults[key] = incoming;
      continue;
    }

    var existing = varietyResults[key];
    var merged = typesOf(existing.value).slice();
    var incomingTypes = typesOf(incoming.value);
    // Index loop on purpose: for...in over an array yields indices, not names.
    for (var i = 0; i < incomingTypes.length; i++) {
      if (merged.indexOf(incomingTypes[i]) === -1) {
        merged.push(incomingTypes[i]);
      }
    }
    if (merged.length > 1) {
      existing.value.types = merged;
      delete existing.value.type;
    }
    existing.totalOccurrences++;
  }
};
var interimResults = {} //hold results here until converted to final format
// main cursor // main cursor
db[collection].find(query).sort(sort).limit(limit).forEach(function(obj) { db[collection].find(query).sort(sort).limit(limit).forEach(function(obj) {
var recordResult = {}; //printjson(obj)
for (var key in obj) { flattened = serializeDoc(obj, maxDepth);
if(obj.hasOwnProperty(key)) { //printjson(flattened)
var value = obj[key]; for (key in flattened){
addRecordResult(key, value, recordResult); var value = flattened[key];
if (maxDepth > 1 && varietyCanHaveChildren(value)) { var valueType = varietyTypeOf(value);
mapRecursive(key, value, 1, recordResult); if(!(key in interimResults)){ //if it's a new key we haven't seen yet
} //for the moment, store 'types' as a dictionary. An easy way to prevent duplicates
} var newEntry = {'types':{},'totalOccurrences':1};
} newEntry['types'][valueType] = true;
addVarietyResults(recordResult); interimResults[key] = newEntry;
}
else{ //we've seen this key before
interimResults[key]['types'][valueType] = true;
interimResults[key]['totalOccurrences']++;
}
}
}); });
// Final results, one output record per key of interimResults.
var varietyResults = {};
// Re-shape every interim tally into its output record; the dictionary of
// seen types collapses into a plain list of type names.
for(key in interimResults){
  var tally = interimResults[key];
  varietyResults[key] = {
    '_id': {'key': key},
    'value': {'types': Object.keys(tally.types)},
    'totalOccurrences': tally.totalOccurrences,
    'percentContaining': tally.totalOccurrences * 100 / limit
  };
}
var resultsDB = db.getMongo().getDB("varietyResults"); var resultsDB = db.getMongo().getDB("varietyResults");
var resultsCollectionName = collection + "Keys"; var resultsCollectionName = collection + "Keys";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment