Commit 22114df4 by Wes Freeman

remove map reduce... experimental!

parent 5269155d
...@@ -104,7 +104,7 @@ I accept pull requests from forks. Very grateful to accept contributions from fo ...@@ -104,7 +104,7 @@ I accept pull requests from forks. Very grateful to accept contributions from fo
#### Special Thanks #### #### Special Thanks ####
I offer sincere thanks to Wes Freeman (wfreeman) for contiued deveopment of Variety. I offer sincere thanks to Wes Freeman (wfreeman) for continued development of Variety.
Additional special thanks to Gaëtan Voyer-Perrault ([@gatesvp](https://twitter.com/#!/@gatesvp)) and Kristina Chodorow ([@kchodorow](https://twitter.com/#!/kchodorow)) for answering other people's questions about how to do this on Stack Overflow, thereby providing me with the initial seed of code which grew into this tool. Additional special thanks to Gaëtan Voyer-Perrault ([@gatesvp](https://twitter.com/#!/@gatesvp)) and Kristina Chodorow ([@kchodorow](https://twitter.com/#!/kchodorow)) for answering other people's questions about how to do this on Stack Overflow, thereby providing me with the initial seed of code which grew into this tool.
......
...@@ -64,21 +64,6 @@ varietyCanHaveChildren = function (v) { ...@@ -64,21 +64,6 @@ varietyCanHaveChildren = function (v) {
} }
db.system.js.save( { _id : "varietyCanHaveChildren", value : varietyCanHaveChildren } ); db.system.js.save( { _id : "varietyCanHaveChildren", value : varietyCanHaveChildren } );
// Map-side helper: enumerates the properties of a nested value and emits one
// {key, type} pair per property, descending into children while the depth
// limit allows.
//   parentKey - dotted path of the value being walked
//   keys      - the value whose properties are enumerated
//   level     - current nesting depth, compared against maxDepth
varietyMapRecursive = function(parentKey, keys, level) {
  for (var name in keys) {
    var child = keys[name];
    // Numeric path segments (".0", ".1", ...) are collapsed into ".XX".
    var path = (parentKey + "." + name).replace(/\.\d+/g,'.XX');
    emit({key : path}, {type: varietyTypeOf(child)});
    if (!(level < maxDepth - 1)) {
      continue;
    }
    if (varietyCanHaveChildren(child)) {
      varietyMapRecursive(path, child, level + 1);
    }
  }
};
// Save the helper into db.system.js under its own name.
db.system.js.save({_id: "varietyMapRecursive", value: varietyMapRecursive});
varietyTypeOf = function(thing) { varietyTypeOf = function(thing) {
if (typeof thing === "undefined") { throw "varietyTypeOf() requires an argument"; } if (typeof thing === "undefined") { throw "varietyTypeOf() requires an argument"; }
...@@ -117,53 +102,83 @@ varietyTypeOf = function(thing) { ...@@ -117,53 +102,83 @@ varietyTypeOf = function(thing) {
} }
db.system.js.save({_id: "varietyTypeOf", value: varietyTypeOf}); db.system.js.save({_id: "varietyTypeOf", value: varietyTypeOf});
map = function() { var varietyResults = {};
var keys = this; var countResults = 0;
for (var key in keys) {
var value = keys[key];
// Internally, Mongo uses keys like groceries.0, groceries.1, groceries.2 for
// items in an array. -JC
key = key.replace(/\.\d+/g,'.XX');
emit({key : key}, {type: varietyTypeOf(value)});
if (varietyCanHaveChildren(value) && maxDepth > 1) { var addTypeToArray = function(arr, value) {
varietyMapRecursive(key, value, 1); var t = varietyTypeOf(value);
var found = false;
for(var i=0; i< arr.length; i++) {
if(arr[i] == t) {
found = true;
break;
} }
} }
if(!found) {
arr.push(t);
}
} }
reduce = function(key, values){ var addVarietyResult = function(key, value) {
var types = []; cur = varietyResults[key];
values.forEach(function(value) { if(cur == null) {
if(types.indexOf(value.type) === -1) { varietyResults[key] = {"_id":{"key":key},"value": {"type": varietyTypeOf(value)}, totalOccurrences:1};
// i.e. "if 'types' does not already have 'value.type', then insert it } else {
// into 'types'." -JC var type = varietyTypeOf(value);
types.push(value.type); if(cur.value.type != type) {
cur.value.types = [cur.value.type];
delete cur.value["type"];
addTypeToArray(cur.value.types, type);
} else if(!cur.value.type) {
addTypeToArray(cur.value.types, type);
}
cur.totalOccurrences++;
varietyResults[key] = cur;
} }
});
return { types: types };
} }
var resultsCollectionName = collection + "Keys";

// Recursively records every own property found beneath `obj`.
//   parentKey - dotted path of `obj` within the document
//   obj       - nested value whose own properties are walked
//   level     - current nesting depth; descent stops once level reaches maxDepth - 1
var mapRecursive = function(parentKey, obj, level){
  for (var name in obj) {
    if (!obj.hasOwnProperty(name)) {
      continue;
    }
    var child = obj[name];
    // Numeric path segments (".0", ".1", ...) are collapsed into ".XX".
    var path = (parentKey + "." + name).replace(/\.\d+/g,'.XX');
    addVarietyResult(path, child);
    var mayDescend = level < maxDepth - 1 && varietyCanHaveChildren(child);
    if (mayDescend) {
      mapRecursive(path, child, level + 1);
    }
  }
};
db[collection].mapReduce(map, reduce, { // main cursor
out: { db[collection].find().forEach(function(obj) {
replace : resultsCollectionName, countResults++;
db : "varietyResults"}, for (var key in obj) {
limit : limit, if(obj.hasOwnProperty(key)) {
sort : {_id: -1}, var value = obj[key];
scope : { limit : limit, maxDepth:maxDepth }}); addVarietyResult(key, value);
if (maxDepth > 1 && varietyCanHaveChildren(value)) {
mapRecursive(key, value, 1);
}
}
}
});
var resultsDB = db.getMongo().getDB("varietyResults"); var resultsDB = db.getMongo().getDB("varietyResults");
var resultsCollectionName = collection + "Keys";

// Replace the results collection: drop the existing one, then insert one
// document per entry accumulated in varietyResults.
print("creating results collection: "+resultsCollectionName);
resultsDB[resultsCollectionName].drop();
// The loop variable is declared with var so it does not become an implicit
// global. The own-property guard goes through Object.prototype because
// varietyResults is keyed by arbitrary field names, one of which could
// shadow hasOwnProperty on the object itself.
for (var resultKey in varietyResults) {
  if (Object.prototype.hasOwnProperty.call(varietyResults, resultKey)) {
    resultsDB[resultsCollectionName].insert(varietyResults[resultKey]);
  }
}
var numDocuments = db[collection].count(); var numDocuments = db[collection].count();
print("removing leaf arrays in results collection, and getting percentages");
resultsDB[resultsCollectionName].find({}).forEach(function(key) { resultsDB[resultsCollectionName].find({}).forEach(function(key) {
keyName = key["_id"].key; var keyName = key["_id"].key;
// We throw away keys which end in an array index, since they are not useful // We throw away keys which end in an array index, since they are not useful
// for our analysis. (We still keep the key of their parent array, though.) -JC // for our analysis. (We still keep the key of their parent array, though.) -JC
...@@ -178,18 +193,11 @@ resultsDB[resultsCollectionName].find({}).forEach(function(key) { ...@@ -178,18 +193,11 @@ resultsDB[resultsCollectionName].find({}).forEach(function(key) {
// just need to pull out .XX in this case // just need to pull out .XX in this case
keyName = keyName.replace(/.XX/g,""); keyName = keyName.replace(/.XX/g,"");
} }
var existsQuery = {};
existsQuery[keyName] = {$exists: true};
key.totalOccurrences = db[collection].count(existsQuery);
key.percentContaining = (key.totalOccurrences / numDocuments) * 100; key.percentContaining = (key.totalOccurrences / numDocuments) * 100;
resultsDB[resultsCollectionName].save(key); resultsDB[resultsCollectionName].save(key);
}); });
var sortedKeys = resultsDB[resultsCollectionName].find({}).sort({totalOccurrences: -1}); var sortedKeys = resultsDB[resultsCollectionName].find({}).sort({totalOccurrences: -1});
sortedKeys.forEach(function(key) { sortedKeys.forEach(function(key) {
print(tojson(key, '', true)); print(tojson(key, '', true));
}); });
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment