Commit 22114df4 by Wes Freeman

remove map reduce... experimental!

parent 5269155d
......@@ -104,7 +104,7 @@ I accept pull requests from forks. Very grateful to accept contributions from fo
#### Special Thanks ####
I offer sincere thanks to Wes Freeman (wfreeman) for contiued deveopment of Variety.
I offer sincere thanks to Wes Freeman (wfreeman) for continued development of Variety.
Additional special thanks to Gaëtan Voyer-Perrault ([@gatesvp](https://twitter.com/#!/@gatesvp)) and Kristina Chodorow ([@kchodorow](https://twitter.com/#!/kchodorow)) for answering other people's questions about how to do this on Stack Overflow, thereby providing me with the initial seed of code which grew into this tool.
......
......@@ -64,21 +64,6 @@ varietyCanHaveChildren = function (v) {
}
db.system.js.save( { _id : "varietyCanHaveChildren", value : varietyCanHaveChildren } );
// Walks the fields of one nested value and emits a {key, type} pair for each,
// descending further while the depth limit allows it.
//
// parentKey - dotted path of the value being expanded
// keys      - the object or array whose fields are visited
// level     - depth of `keys`; recursion stops once level reaches maxDepth - 1
//
// Relies on emit, maxDepth, varietyTypeOf and varietyCanHaveChildren being in scope.
// Declared with `var` so the name is defined explicitly instead of through an
// implicit global assignment.
var varietyMapRecursive = function(parentKey, keys, level) {
  for (var key in keys) {
    // Only the value's own fields count; skip anything inherited via the prototype chain.
    if (!Object.prototype.hasOwnProperty.call(keys, key)) {
      continue;
    }
    var value = keys[key];
    // Numeric path segments (array positions such as groceries.0, groceries.1)
    // are folded into a single .XX segment.
    var path = (parentKey + "." + key).replace(/\.\d+/g,'.XX');
    emit({key : path}, {type: varietyTypeOf(value)});
    if (level < maxDepth - 1 && varietyCanHaveChildren(value)) {
      varietyMapRecursive(path, value, level + 1);
    }
  }
};
// Save the function into the system.js collection under the _id "varietyMapRecursive".
db.system.js.save({_id: "varietyMapRecursive", value: varietyMapRecursive});
varietyTypeOf = function(thing) {
if (typeof thing === "undefined") { throw "varietyTypeOf() requires an argument"; }
......@@ -117,53 +102,83 @@ varietyTypeOf = function(thing) {
}
db.system.js.save({_id: "varietyTypeOf", value: varietyTypeOf});
map = function() {
var keys = this;
for (var key in keys) {
var value = keys[key];
// Internally, Mongo uses keys like groceries.0, groceries.1, groceries.2 for
// items in an array. -JC
key = key.replace(/\.\d+/g,'.XX');
emit({key : key}, {type: varietyTypeOf(value)});
// Per-key result documents, indexed by key path; filled in by addVarietyResult.
var varietyResults = {};
// Incremented once per document visited by the main cursor loop.
var countResults = 0;
if (varietyCanHaveChildren(value) && maxDepth > 1) {
varietyMapRecursive(key, value, 1);
// Adds the type name of `value` to `arr` unless that name is already present.
//
// arr   - array of type names collected so far; modified in place
// value - the value whose type, as reported by varietyTypeOf, should be recorded
var addTypeToArray = function(arr, value) {
  var typeName = varietyTypeOf(value);
  // indexOf compares with ===, so only an identical type name counts as a duplicate.
  if (arr.indexOf(typeName) === -1) {
    arr.push(typeName);
  }
};
reduce = function(key, values){
var types = [];
values.forEach(function(value) {
if(types.indexOf(value.type) === -1) {
// i.e. "if 'types' does not already have 'value.type', then insert it
// into 'types'." -JC
types.push(value.type);
// Records one occurrence of `key`, with the type of `value`, in varietyResults.
// The first sighting of a key creates an entry holding a single value.type and
// totalOccurrences of 1. Later sightings increment totalOccurrences and, when
// the type differs, switch the entry from value.type to a value.types array.
var addVarietyResult = function(key, value) {
// NOTE(review): `cur` is assigned without `var`, so it is an implicit global.
cur = varietyResults[key];
// `== null` is true for both undefined (key not seen yet) and null.
if(cur == null) {
varietyResults[key] = {"_id":{"key":key},"value": {"type": varietyTypeOf(value)}, totalOccurrences:1};
} else {
var type = varietyTypeOf(value);
// NOTE(review): after value.type has been deleted below it reads as undefined,
// so this comparison is true again on every later call for the same key and
// value.types is rebuilt as [undefined] plus the current type, losing the types
// gathered so far. The `else if` branch looks unreachable in that state — confirm and fix.
if(cur.value.type != type) {
cur.value.types = [cur.value.type];
delete cur.value["type"];
addTypeToArray(cur.value.types, type);
} else if(!cur.value.type) {
addTypeToArray(cur.value.types, type);
}
cur.totalOccurrences++;
varietyResults[key] = cur;
}
// NOTE(review): the next two lines do not match anything opened in this function;
// they look like leftovers from a reduce() callback — confirm before relying on this span.
});
return { types: types };
}
var resultsCollectionName = collection + "Keys";
// Records every own field of a nested value via addVarietyResult and descends
// into child values while the depth limit allows it.
//
// parentKey - dotted path of the value being expanded
// obj       - the object or array whose fields are visited
// level     - depth of `obj`; recursion stops once level reaches maxDepth - 1
//
// Relies on addVarietyResult, maxDepth and varietyCanHaveChildren being in scope.
var mapRecursive = function(parentKey, obj, level){
  for (var key in obj) {
    // Go through Object.prototype so that a field literally named
    // "hasOwnProperty" on the document cannot shadow the check and make it throw.
    if (!Object.prototype.hasOwnProperty.call(obj, key)) {
      continue;
    }
    var value = obj[key];
    // Numeric path segments (array positions) are folded into a single .XX segment.
    var path = (parentKey + "." + key).replace(/\.\d+/g,'.XX');
    addVarietyResult(path, value);
    if (level < maxDepth - 1 && varietyCanHaveChildren(value)) {
      mapRecursive(path, value, level + 1);
    }
  }
};
// Run map/reduce over the collection. Output goes to resultsCollectionName in
// the "varietyResults" database using the `replace` output mode; limit and
// maxDepth are also handed to the job through `scope`.
db[collection].mapReduce(map, reduce, {
  out: {
    replace: resultsCollectionName,
    db: "varietyResults"
  },
  limit: limit,
  sort: { _id: -1 },
  scope: {
    limit: limit,
    maxDepth: maxDepth
  }
});
// main cursor
// Visit every document in the collection: count it, record each own top-level
// field, and descend into nested values when maxDepth allows more than one level.
db[collection].find().forEach(function(obj) {
  countResults++;
  for (var key in obj) {
    // Go through Object.prototype so that a document field named
    // "hasOwnProperty" cannot shadow the check and make it throw.
    if (!Object.prototype.hasOwnProperty.call(obj, key)) {
      continue;
    }
    var value = obj[key];
    addVarietyResult(key, value);
    if (maxDepth > 1 && varietyCanHaveChildren(value)) {
      mapRecursive(key, value, 1);
    }
  }
});
// Results are written to the "varietyResults" database, in a collection named
// after the analysed collection with "Keys" appended.
var resultsDB = db.getMongo().getDB("varietyResults");
var resultsCollectionName = collection + "Keys";

// replace results collection
print("creating results collection: "+resultsCollectionName);
resultsDB[resultsCollectionName].drop();
// `result` is declared with var so the loop no longer creates an implicit global;
// the own-property check keeps inherited enumerable properties out of the output.
for (var result in varietyResults) {
  if (Object.prototype.hasOwnProperty.call(varietyResults, result)) {
    resultsDB[resultsCollectionName].insert(varietyResults[result]);
  }
}

// Total number of documents in the analysed collection.
var numDocuments = db[collection].count();

print("removing leaf arrays in results collection, and getting percentages");
resultsDB[resultsCollectionName].find({}).forEach(function(key) {
keyName = key["_id"].key;
var keyName = key["_id"].key;
// We throw away keys which end in an array index, since they are not useful
// for our analysis. (We still keep the key of their parent array, though.) -JC
......@@ -178,18 +193,11 @@ resultsDB[resultsCollectionName].find({}).forEach(function(key) {
// just need to pull out .XX in this case
keyName = keyName.replace(/.XX/g,"");
}
var existsQuery = {};
existsQuery[keyName] = {$exists: true};
key.totalOccurrences = db[collection].count(existsQuery);
key.percentContaining = (key.totalOccurrences / numDocuments) * 100;
resultsDB[resultsCollectionName].save(key);
});
// Print every stored key document as JSON, ordered by totalOccurrences descending.
var sortedKeys = resultsDB[resultsCollectionName]
  .find({})
  .sort({ totalOccurrences: -1 });

sortedKeys.forEach(function(keyDocument) {
  var rendered = tojson(keyDocument, '', true);
  print(rendered);
});
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment