Commit 2acff789 by Wes Freeman

Merge pull request #11 from wfreeman/master

Removing map reduce
parents b9459c8a b80f0fef
...@@ -62,23 +62,7 @@ varietyCanHaveChildren = function (v) { ...@@ -62,23 +62,7 @@ varietyCanHaveChildren = function (v) {
v instanceof BinData; v instanceof BinData;
return !specialObject && (isArray || isObject); return !specialObject && (isArray || isObject);
} }
db.system.js.save( { _id : "varietyCanHaveChildren", value : varietyCanHaveChildren } );
varietyMapRecursive = function(parentKey, keys, level) {
for (var key in keys) {
var value = keys[key];
key = (parentKey + "." + key).replace(/\.\d+/g,'.XX');
emit({key : key}, {type: varietyTypeOf(value)});
if (level < maxDepth - 1 && varietyCanHaveChildren(value)) {
varietyMapRecursive(key, value, level + 1);
}
}
}
db.system.js.save({_id: "varietyMapRecursive", value: varietyMapRecursive});
varietyTypeOf = function(thing) { varietyTypeOf = function(thing) {
if (typeof thing === "undefined") { throw "varietyTypeOf() requires an argument"; } if (typeof thing === "undefined") { throw "varietyTypeOf() requires an argument"; }
...@@ -115,55 +99,83 @@ varietyTypeOf = function(thing) { ...@@ -115,55 +99,83 @@ varietyTypeOf = function(thing) {
} }
} }
} }
db.system.js.save({_id: "varietyTypeOf", value: varietyTypeOf});
map = function() { // store results here (no map reduce limit!)
var keys = this; var varietyResults = {};
for (var key in keys) { var addTypeToArray = function(arr, value) {
var value = keys[key]; var t = varietyTypeOf(value);
var found = false;
// Internally, Mongo uses keys like groceries.0, groceries.1, groceries.2 for for(var i=0; i< arr.length; i++) {
// items in an array. -JC if(arr[i] == t) {
key = key.replace(/\.\d+/g,'.XX'); found = true;
break;
emit({key : key}, {type: varietyTypeOf(value)});
if (varietyCanHaveChildren(value) && maxDepth > 1) {
varietyMapRecursive(key, value, 1);
} }
} }
if(!found) {
arr.push(t);
}
} }
reduce = function(key, values){ var addVarietyResult = function(key, value) {
var types = []; cur = varietyResults[key];
values.forEach(function(value) { if(cur == null) {
if(types.indexOf(value.type) === -1) { varietyResults[key] = {"_id":{"key":key},"value": {"type": varietyTypeOf(value)}, totalOccurrences:1};
// i.e. "if 'types' does not already have 'value.type', then insert it } else {
// into 'types'." -JC var type = varietyTypeOf(value);
types.push(value.type); if(cur.value.type != type) {
cur.value.types = [cur.value.type];
delete cur.value["type"];
addTypeToArray(cur.value.types, type);
} else if(!cur.value.type) {
addTypeToArray(cur.value.types, type);
} }
}); cur.totalOccurrences++;
varietyResults[key] = cur;
return { types: types }; }
} }
var resultsCollectionName = collection + "Keys"; var mapRecursive = function(parentKey, obj, level){
for (var key in obj) {
if(obj.hasOwnProperty(key)) {
var value = obj[key];
key = (parentKey + "." + key).replace(/\.\d+/g,'.XX');
addVarietyResult(key, value);
if (level < maxDepth - 1 && varietyCanHaveChildren(value)) {
mapRecursive(key, value, level + 1);
}
}
}
}
db[collection].mapReduce(map, reduce, { // main cursor
out: { db[collection].find().sort({_id: -1}).limit(limit).forEach(function(obj) {
replace : resultsCollectionName, for (var key in obj) {
db : "varietyResults"}, if(obj.hasOwnProperty(key)) {
limit : limit, var value = obj[key];
sort : {_id: -1}, addVarietyResult(key, value);
scope : { limit : limit, maxDepth:maxDepth }}); if (maxDepth > 1 && varietyCanHaveChildren(value)) {
mapRecursive(key, value, 1);
}
}
}
});
var resultsDB = db.getMongo().getDB("varietyResults"); var resultsDB = db.getMongo().getDB("varietyResults");
var resultsCollectionName = collection + "Keys";
// replace results collection
print("creating results collection: "+resultsCollectionName);
resultsDB[resultsCollectionName].drop();
for(result in varietyResults) {
resultsDB[resultsCollectionName].insert(varietyResults[result]);
}
var numDocuments = db[collection].count(); var numDocuments = db[collection].count();
print("removing leaf arrays in results collection, and getting percentages");
resultsDB[resultsCollectionName].find({}).forEach(function(key) { resultsDB[resultsCollectionName].find({}).forEach(function(key) {
keyName = key["_id"].key; var keyName = key["_id"].key;
// We throw away keys which end in an array index, since they are not useful // We throw away keys which end in an array index, since they are not useful
// for our analysis. (We still keep the key of their parent array, though.) -JC // for our analysis. (We still keep the key of their parent array, though.) -JC
...@@ -178,18 +190,17 @@ resultsDB[resultsCollectionName].find({}).forEach(function(key) { ...@@ -178,18 +190,17 @@ resultsDB[resultsCollectionName].find({}).forEach(function(key) {
// just need to pull out .XX in this case // just need to pull out .XX in this case
keyName = keyName.replace(/.XX/g,""); keyName = keyName.replace(/.XX/g,"");
} }
var existsQuery = {}; // we don't need to set it if limit isn't being used. (it's set above.)
existsQuery[keyName] = {$exists: true}; if(limit < numDocuments) {
var existsQuery = {};
key.totalOccurrences = db[collection].count(existsQuery); existsQuery[keyName] = {$exists: true};
key.totalOccurrences = db[collection].count(existsQuery);
}
key.percentContaining = (key.totalOccurrences / numDocuments) * 100; key.percentContaining = (key.totalOccurrences / numDocuments) * 100;
resultsDB[resultsCollectionName].save(key); resultsDB[resultsCollectionName].save(key);
}); });
var sortedKeys = resultsDB[resultsCollectionName].find({}).sort({totalOccurrences: -1}); var sortedKeys = resultsDB[resultsCollectionName].find({}).sort({totalOccurrences: -1});
sortedKeys.forEach(function(key) { sortedKeys.forEach(function(key) {
print(tojson(key, '', true)); print(tojson(key, '', true));
}); });
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment