Commit 3992d1cd by devttys0

Initial commit of rehash; some bug fixes / feature additions left to do in

parent 48a1a48b
#!/usr/bin/env python
import os
import re
import sys
import magic
import binwalk.hashmatch as hashmatch
from binwalk.compat import *
from getopt import GetoptError, gnu_getopt as GetOpt
def usage(fd):
fd.write("Usage: %s [OPTIONS] [FILE | DIR] [FILE | DIR] ...\n" % sys.argv[0])
def main():
results = []
options = []
arguments = []
file_list = []
types = {}
strings = False
symlinks = False
all_types = False
name = False
same = False
missing = False
cutoff = None
max_results = None
verbose = False
short_options = "c:hlmnSsvx:"
long_options = [
opts, args = GetOpt(sys.argv[1:], short_options, long_options)
except GetoptError as e:
sys.stderr.write("%s\n" % str(e))
for opt, arg in opts:
if opt in ("-h", "--help"):
elif opt in ("-S", "--strings"):
strings = True
elif opt in ("-l", "--symlinks"):
symlinks = True
elif opt in ("-n", "--name"):
name = True
elif opt in ("-s", "--show-same"):
same = True
elif opt in ("-m", "--show-missing"):
missing = True
elif opt in ("-x", "--max"):
max_results = int(arg, 0)
elif opt in ("-c", "--cutoff"):
cutoff = int(arg, 0)
elif opt in ("-v", "--verbose"):
verbose = True
# Keep track of the options and arguments.
# This is used later to determine which argv entries are file names.
options.append("%s%s" % (opt, arg))
options.append("%s=%s" % (opt, arg))
# Treat any command line options not processed by getopt as target file paths.
for opt in sys.argv[1:]:
if opt not in arguments and opt not in options and not opt.startswith('-'):
if len(file_list) >= 2:
rehash = hashmatch.HashMatch(cutoff=cutoff,
if os.path.isfile(file_list[0]):
if not all_types and len(types) == 0:
m =
file_type = m.file(file_list[0])
if file_type:
types[True] = re.escape(file_type.lower())
if os.path.isfile(file_list[1]):
results = rehash.files(file_list[0], file_list[1])
results = rehash.file(file_list[0], file_list[1:])
for f in file_list:
if not os.path.isdir(f):
print("Invalid usage")
results = rehash.directories(file_list[0], file_list[1])
for (match, fname) in results:
print("%s %s" % (match, fname))
if __name__ == "__main__":
......@@ -20,24 +20,24 @@ class HashMatch(object):
def __init__(self, cutoff=None, fuzzy=True, strings=False, same=False, missing=False, symlinks=False, name=False, matches={}, types={}):
def __init__(self, cutoff=None, strings=False, same=False, missing=False, symlinks=False, name=False, max_results=None, matches={}, types={}, verbose=False):
Class constructor.
@cutoff - The fuzzy cutoff which determines if files are different or not.
@fuzzy - Set to True to do fuzzy hashing; set to False to do traditional hashing.
@strings - Only hash strings inside of the file, not the entire file itself.
@same - Set to True to show files that are the same, False to show files that are different.
@missing - Set to True to show missing files.
@symlinks - Set to True to include symbolic link files.
@name - Set to True to only compare files whose base names match.
@max_results - Stop searching after x number of matches.
@matches - A dictionary of file names to diff.
@types - A dictionary of file types to diff.
@verbose - Enable verbose mode.
Returns None.
self.cutoff = cutoff
self.fuzzy = fuzzy
self.strings = strings
self.show_same = same
self.show_missing = missing
......@@ -45,6 +45,10 @@ class HashMatch(object):
self.matches = matches = name
self.types = types
self.max_results = max_results
self.verbose = verbose = 0
self.magic =
......@@ -58,9 +62,13 @@ class HashMatch(object):
self.types[k] = re.compile(self.types[k])
def _get_strings(self, fname):
return ''.join([string for (offset, string) in binwalk.smartstrings.FileStrings(fname, n=10).strings()])
return ''.join([string for (offset, string) in binwalk.smartstrings.FileStrings(fname, n=10, block=None).strings()])
def files(self, file1, file2):
def _print(self, message):
if self.verbose:
def _compare_files(self, file1, file2):
Fuzzy diff two files.
......@@ -73,7 +81,10 @@ class HashMatch(object):
status = 0
if not or os.path.basename(file1) == os.path.basename(file2):
if self.fuzzy:
if os.path.exists(file1) and os.path.exists(file2):
self._print("Checking %s -> %s" % (file1, file2))
hash1 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)
hash2 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)
......@@ -98,11 +109,7 @@ class HashMatch(object):
return self.lib.fuzzy_compare(hash1, hash2)
except Exception as e:
print "WARNING: Exception while performing fuzzy comparison:", e
elif not self.strings:
if file_md5(file1) == file_md5(file2):
return 100
print "WARNING: Exception while doing fuzzy hash:", e
return None
......@@ -111,7 +118,7 @@ class HashMatch(object):
Returns True if the match value is greater than or equal to the cutoff.
Returns False if the match value is less than the cutoff.
return (match >= self.cutoff)
return (match is not None and match >= self.cutoff)
def _get_file_list(self, directory):
......@@ -141,7 +148,11 @@ class HashMatch(object):
if self.types:
for f in files:
for (include, type_regex) in iterator(self.types):
magic_result = self.magic.file(f).lower()
magic_result = self.magic.file(os.path.join(directory, f)).lower()
except Exception as e:
magic_result = ''
match = type_regex.match(magic_result)
# If this matched an include filter, or didn't match an exclude filter
......@@ -162,6 +173,12 @@ class HashMatch(object):
return set(file_list)
def files(self, file1, file2):
m = self._compare_files(file1, file2)
if m is None:
m = 0
return [(m, file2)]
def file(self, fname, directories):
Search for a particular file in multiple directories.
......@@ -172,45 +189,57 @@ class HashMatch(object):
Returns a list of tuple results.
matching_files = [] = 0
for directory in directories:
for f in self._get_file_list(directory):
f = os.path.join(directory, f)
m = self.files(fname, f)
if self.is_match(m):
m = self._compare_files(fname, f)
if m is not None and self.is_match(m):
matching_files.append((m, f)) += 1
if self.max_results and >= self.max_results:
return matching_files
return matching_files
def directories(self, dir1, dir2):
def directories(self, source, dir_list):
Search two directories for matching files.
@dir1 - First directory.
@dir2 - Second directory.
@source - Source directory to compare everything to.
@dir_list - Compare files in source to files in these directories.
Returns a list of tuple results.
results = [] = 0
source_files = self._get_file_list(source)
dir1_files = self._get_file_list(dir1)
dir2_files = self._get_file_list(dir2)
for directory in dir_list:
dir_files = self._get_file_list(directory)
for f in dir1_files:
if f in dir2_files:
file1 = os.path.join(dir1, f)
file2 = os.path.join(dir2, f)
for f in source_files:
if f in dir_files:
file1 = os.path.join(source, f)
file2 = os.path.join(directory, f)
m = self.files(file1, file2)
m = self._compare_files(file1, file2)
if m is not None:
matches = self.is_match(m)
if (matches and self.show_same) or (not matches and not self.show_same):
results.append(("%3d" % m, f))
if self.show_missing:
results += [('---', f) for f in (dir1_files-dir2_files)]
results += [('+++', f) for f in (dir2_files-dir1_files)] += 1
if self.max_results and >= self.max_results:
return results
if self.show_missing and len(dir_list) == 1:
results += [('---', f) for f in (source_files-dir_files)]
results += [('+++', f) for f in (dir_files-source_files)]
return results
......@@ -218,7 +247,7 @@ class HashMatch(object):
if __name__ == '__main__':
import sys
hmatch = HashMatch(strings=True, name=True)
hmatch = HashMatch(strings=True, name=False, types={True:"^elf"})
print hmatch.file(sys.argv[1], sys.argv[2:])
#for (match, fname) in hmatch.directories(sys.argv[1], sys.argv[2]):
#for (match, fname) in hmatch.find_file(sys.argv[1], sys.argv[2:]):
......@@ -41,7 +41,7 @@ class FileStrings(object):
@length - The number of bytes in the file to analyze.
@offset - The starting offset into the file to begin analysis.
@n - The minimum valid string length.
@block - The block size to use when performing entropy analysis.
@block     - The block size to use when performing entropy analysis. Set to None to skip entropy analysis.
@algorithm - The entropy algorithm to use when performing entropy analysis.
@plugins - An instance of the Plugins class.
......@@ -59,22 +59,31 @@ class FileStrings(object):
self.valid_strings = []
self.external_validators = []
self.plugins = plugins
self.block = block
if not self.n:
self.n = self.MIN_STRING_LENGTH
if self.block is not None:
# Perform an entropy analysis over the entire file (anything less may generate poor entropy data).
# Give fake file results list to prevent FileEntropy from doing too much analysis.
with entropy.FileEntropy(file_name, block=block, file_results=['foo']) as e:
with entropy.FileEntropy(file_name, block=self.block, file_results=['foo']) as e:
(self.x, self.y, self.average_entropy) = e.analyze(algorithm=algorithm)
for i in range(0, len(self.x)):
self.entropy[self.x[i]] = self.y[i]
# Make sure our block size matches the entropy analysis's block size
self.block = e.block
# Make sure the starting offset is a multiple of the block size; else, when later checking
# the entropy analysis, block offsets won't line up.
self.start -= (self.start % self.block)
i = 0
self.block = common.BlockFile.READ_BLOCK_SIZE
# Fake the entropy scan
while i < common.file_size(file_name):
self.entropy[i] = 1.0
i += self.block
self.fd = common.BlockFile(file_name, 'r', length=length, offset=self.start)
# TODO: This is not optimal. We should read in larger chunks and process them into self.block chunks.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment