Commit 18ec4c82 by Tomas Dvorak

Merge pull request #104 from objectrocket/master

Enumerate type counts for multiple types; enable secondary reads
parents 29d5e3ae 2313e1c2
...@@ -7,7 +7,7 @@ This lightweight tool helps you get a sense of your application's schema, as wel ...@@ -7,7 +7,7 @@ This lightweight tool helps you get a sense of your application's schema, as wel
_“I happen to slowly be falling in love with Variety! It is actually one of the most useful tools to get a sense for a messy/unknown data set, and I have put it in a few of our exercises at Zipfian Academy.”_ _“I happen to slowly be falling in love with Variety! It is actually one of the most useful tools to get a sense for a messy/unknown data set, and I have put it in a few of our exercises at Zipfian Academy.”_
Jon Dinu Jon Dinu
_Co-founder of [Zipfian Academy](http://www.zipfianacademy.com/)_ _Co-founder of [Zipfian Academy](http://www.zipfianacademy.com/)_
*** ***
...@@ -28,23 +28,23 @@ So, let's see what we've got here: ...@@ -28,23 +28,23 @@ So, let's see what we've got here:
$ mongo test --eval "var collection = 'users'" variety.js $ mongo test --eval "var collection = 'users'" variety.js
+------------------------------------------------------------+ +------------------------------------------------------------------+
| key | types | occurrences | percents | | key | types | occurrences | percents |
| ------------------ | ------------ | ----------- | -------- | | ------------------ | ------------ | ----------- | -------- |
| _id | ObjectId | 5 | 100.0 | | _id | ObjectId | 5 | 100.0 |
| name | String | 5 | 100.0 | | name | String | 5 | 100.0 |
| bio | String | 3 | 60.0 | | bio | String | 3 | 60.0 |
| birthday | String | 2 | 40.0 | | birthday | String | 2 | 40.0 |
| pets | Array,String | 2 | 40.0 | | pets | Array(4),String(1) | 5 | 40.0 |
| someBinData | BinData-old | 1 | 20.0 | | someBinData | BinData-old | 1 | 20.0 |
| someWeirdLegacyKey | String | 1 | 20.0 | | someWeirdLegacyKey | String | 1 | 20.0 |
+------------------------------------------------------------+ +------------------------------------------------------------------+
_("test" is the database containing the collection we are analyzing.)_ _("test" is the database containing the collection we are analyzing.)_
Hmm. Looks like everybody has a "name" and "_id". Most, but not all have a "bio". Hmm. Looks like everybody has a "name" and "_id". Most, but not all have a "bio".
Interestingly, it looks like "pets" can be either an array or a string. Will this cause any problems in the application, I wonder? Interestingly, it looks like "pets" can be either an array or a string, but there are more arrays than strings. Will this cause any problems in the application, I wonder?
Seems like the first document created has a weird legacy key—those damn fools who built the prototype didn't clean up after themselves. If there were a thousand such early documents, I might cross-reference the codebase to confirm they are no longer used, and then delete them all. That way they'll not confuse any future developers. Seems like the first document created has a weird legacy key—those damn fools who built the prototype didn't clean up after themselves. If there were a thousand such early documents, I might cross-reference the codebase to confirm they are no longer used, and then delete them all. That way they'll not confuse any future developers.
...@@ -148,6 +148,12 @@ Variety can also read that option and mute unnecessary output. This is useful in ...@@ -148,6 +148,12 @@ Variety can also read that option and mute unnecessary output. This is useful in
$ mongo test --quiet --eval "var collection = 'users', sort = { updated_at : -1 }" variety.js $ mongo test --quiet --eval "var collection = 'users', sort = { updated_at : -1 }" variety.js
#### Secondary Reads ####
Analyzing a large collection on a busy replica set primary could take a lot longer than reading from a secondary. To read from a secondary instead, we have to tell MongoDB it's okay to perform secondary reads
by setting the ```slaveOk``` property to ```true```:
$ mongo secondary.replicaset.member:31337/somedb --eval "var collection = 'users', slaveOk = true" variety.js
### Save Results in MongoDB For Future Use ### ### Save Results in MongoDB For Future Use ###
By default, Variety prints results only to standard output and does not store them in MongoDB itself. If you want to persist them automatically in database for later usage, you can set the parameter ```persistResults```. By default, Variety prints results only to standard output and does not store them in MongoDB itself. If you want to persist them automatically in database for later usage, you can set the parameter ```persistResults```.
Variety then stores result documents in database ```varietyResults``` and the collection name is derived from the source collection's name. Variety then stores result documents in database ```varietyResults``` and the collection name is derived from the source collection's name.
......
...@@ -23,6 +23,12 @@ log('Version 1.5.0, released 14 May 2015'); ...@@ -23,6 +23,12 @@ log('Version 1.5.0, released 14 May 2015');
var dbs = []; var dbs = [];
var emptyDbs = []; var emptyDbs = [];
// Opt-in secondary reads: enabled only when `slaveOk` was supplied and is exactly `true`.
// The typeof check comes first so that an undeclared `slaveOk` does not raise a ReferenceError.
if (typeof slaveOk !== 'undefined' && slaveOk === true) {
  db.getMongo().setSlaveOk();
}
var knownDatabases = db.adminCommand('listDatabases').databases; var knownDatabases = db.adminCommand('listDatabases').databases;
if(typeof knownDatabases !== 'undefined') { // not authorized user receives error response (json) without databases key if(typeof knownDatabases !== 'undefined') { // not authorized user receives error response (json) without databases key
knownDatabases.forEach(function(d){ knownDatabases.forEach(function(d){
...@@ -214,23 +220,33 @@ var mergeDocument = function(docResult, interimResults) { ...@@ -214,23 +220,33 @@ var mergeDocument = function(docResult, interimResults) {
for (var key in docResult) { for (var key in docResult) {
if(key in interimResults) { if(key in interimResults) {
var existing = interimResults[key]; var existing = interimResults[key];
for(var type in docResult[key]) { for(var type in docResult[key]) {
existing.types[type] = true; if (type in existing.types) {
existing.types[type] = existing.types[type] + 1;
} else {
existing.types[type] = 1;
}
} }
existing.totalOccurrences = existing.totalOccurrences + 1; existing.totalOccurrences = existing.totalOccurrences + 1;
} else { } else {
interimResults[key] = {'types':docResult[key],'totalOccurrences':1}; var types = {};
for (var newType in docResult[key]) {
types[newType] = 1;
}
interimResults[key] = {'types': types,'totalOccurrences':1};
} }
} }
}; };
var convertResults = function(interimResults, documentsCount) { var convertResults = function(interimResults, documentsCount) {
var getKeys = function(obj) { var getKeys = function(obj) {
var keys = []; var keys = {};
for(var key in obj) { for(var key in obj) {
keys.push(key); keys[key] = obj[key];
} }
return keys.sort(); return keys;
//return keys.sort();
}; };
var varietyResults = []; var varietyResults = [];
//now convert the interimResults into the proper format //now convert the interimResults into the proper format
...@@ -301,7 +317,18 @@ var createAsciiTable = function(results) { ...@@ -301,7 +317,18 @@ var createAsciiTable = function(results) {
var maxDigits = varietyResults.map(function(value){return significantDigits(value.percentContaining);}).reduce(function(acc,val){return acc>val?acc:val;}); var maxDigits = varietyResults.map(function(value){return significantDigits(value.percentContaining);}).reduce(function(acc,val){return acc>val?acc:val;});
var rows = results.map(function(row) { var rows = results.map(function(row) {
return [row._id.key, row.value.types, row.totalOccurrences, row.percentContaining.toFixed(maxDigits)]; var types = [];
var typeKeys = Object.keys(row.value.types);
if (typeKeys.length > 1) {
for (var type in row.value.types) {
var typestring = type + ' (' + row.value.types[type] + ')';
types.push(typestring);
}
} else {
types = typeKeys;
}
return [row._id.key, types, row.totalOccurrences, row.percentContaining.toFixed(maxDigits)];
}); });
var table = [headers, headers.map(function(){return '';})].concat(rows); var table = [headers, headers.map(function(){return '';})].concat(rows);
var colMaxWidth = function(arr, index) {return Math.max.apply(null, arr.map(function(row){return row[index].toString().length;}));}; var colMaxWidth = function(arr, index) {return Math.max.apply(null, arr.map(function(row){return row[index].toString().length;}));};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment