Commit 3992d1cd by devttys0

Initial commit of rehash; some bug fixes / feature additions left to do in hashmatch.py

parent 48a1a48b
#!/usr/bin/env python
import os
import re
import sys
import magic
import binwalk.hashmatch as hashmatch
from binwalk.compat import *
from getopt import GetoptError, gnu_getopt as GetOpt
def usage(fd):
    '''
    Write the one-line usage summary for this script.

    @fd - Writable file-like object to print to (main passes sys.stdout or sys.stderr).

    Returns None.
    '''
    # sys.argv[0] is used so the message reflects however the script was invoked.
    fd.write("Usage: %s [OPTIONS] [FILE | DIR] [FILE | DIR] ...\n" % sys.argv[0])
def main():
    '''
    Command line entry point.

    Parses the command line, builds a hashmatch.HashMatch instance from the
    selected options and prints one "<match> <file name>" line per result.

    The first path decides the mode:
        file  file      - compare the two files directly.
        file  dir ...   - search the given directories for the file.
        dir   dir ...   - compare the first directory against the others.

    Exits with status 0 after printing help, and with status 1 on any usage
    error (bad option, bad numeric value, fewer than two paths, or a
    non-directory path in directory mode).

    Returns None.
    '''
    results = []
    types = {}
    strings = False
    symlinks = False
    all_types = False
    name = False
    same = False
    missing = False
    cutoff = None
    max_results = None
    verbose = False

    short_options = "c:hlmnSsvx:"
    long_options = [
        "help",
        "cutoff=",
        "strings",
        "show-same",
        "show-missing",
        "max=",
        "symlinks",
        "name",
        "file-type",
        "file-name",
        "verbose",
    ]

    try:
        opts, args = GetOpt(sys.argv[1:], short_options, long_options)
    except GetoptError as e:
        sys.stderr.write("%s\n" % str(e))
        usage(sys.stderr)
        # Without exiting here, 'opts' would be unbound in the loop below.
        sys.exit(1)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage(sys.stdout)
            sys.exit(0)
        elif opt in ("-S", "--strings"):
            strings = True
        elif opt in ("-l", "--symlinks"):
            symlinks = True
        elif opt in ("-n", "--name"):
            name = True
        elif opt in ("-s", "--show-same"):
            same = True
        elif opt in ("-m", "--show-missing"):
            missing = True
        elif opt in ("-v", "--verbose"):
            verbose = True
        elif opt in ("-x", "--max", "-c", "--cutoff"):
            # Base 0 lets the user supply decimal, hex (0x..) or octal values.
            try:
                value = int(arg, 0)
            except ValueError:
                sys.stderr.write("Invalid numeric value for %s: '%s'\n" % (opt, arg))
                usage(sys.stderr)
                sys.exit(1)

            if opt in ("-x", "--max"):
                max_results = value
            else:
                cutoff = value

    # gnu_getopt already returns every non-option argument (in order), so there
    # is no need to re-scan sys.argv; doing so dropped any path that happened to
    # equal an option's value (e.g. "-c 50 50").
    file_list = list(args)

    if len(file_list) < 2:
        # At least a source and one target are required to compare anything.
        usage(sys.stderr)
        sys.exit(1)

    rehash = hashmatch.HashMatch(cutoff=cutoff,
                                 strings=strings,
                                 symlinks=symlinks,
                                 name=name,
                                 same=same,
                                 missing=missing,
                                 max_results=max_results,
                                 verbose=verbose)

    if os.path.isfile(file_list[0]):
        # NOTE(review): 'types' is populated here but never handed to HashMatch
        # (which was already constructed above), so this filter currently has no
        # effect. Left as-is to avoid changing which files get compared.
        if not all_types and not types:
            m = magic.open(0)
            m.load()
            file_type = m.file(file_list[0])
            if file_type:
                types[True] = re.escape(file_type.lower())

        if os.path.isfile(file_list[1]):
            results = rehash.files(file_list[0], file_list[1])
        else:
            results = rehash.file(file_list[0], file_list[1:])
    else:
        # Directory mode: every path must be a directory.
        for f in file_list:
            if not os.path.isdir(f):
                print("Invalid usage")
                usage(sys.stderr)
                sys.exit(1)

        # HashMatch.directories() iterates over its second argument, so it must
        # be a list of directories; a bare string would be walked per character.
        results = rehash.directories(file_list[0], file_list[1:])

    for (match, fname) in results:
        print("%s %s" % (match, fname))
# Run the command line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
...@@ -20,24 +20,24 @@ class HashMatch(object): ...@@ -20,24 +20,24 @@ class HashMatch(object):
FUZZY_DEFAULT_CUTOFF = 50 FUZZY_DEFAULT_CUTOFF = 50
def __init__(self, cutoff=None, fuzzy=True, strings=False, same=False, missing=False, symlinks=False, name=False, matches={}, types={}): def __init__(self, cutoff=None, strings=False, same=False, missing=False, symlinks=False, name=False, max_results=None, matches={}, types={}, verbose=False):
''' '''
Class constructor. Class constructor.
@cutoff - The fuzzy cutoff which determines if files are different or not. @cutoff - The fuzzy cutoff which determines if files are different or not.
@fuzzy - Set to True to do fuzzy hashing; set to False to do traditional hashing.
@strings - Only hash strings inside of the file, not the entire file itself. @strings - Only hash strings inside of the file, not the entire file itself.
@same - Set to True to show files that are the same, False to show files that are different. @same - Set to True to show files that are the same, False to show files that are different.
@missing - Set to True to show missing files. @missing - Set to True to show missing files.
@symlinks - Set to True to include symbolic link files. @symlinks - Set to True to include symbolic link files.
@name - Set to True to only compare files whose base names match. @name - Set to True to only compare files whose base names match.
@max_results - Stop searching after x number of matches.
@matches - A dictionary of file names to diff. @matches - A dictionary of file names to diff.
@types - A dictionary of file types to diff. @types - A dictionary of file types to diff.
@verbose - Enable verbose mode.
Returns None. Returns None.
''' '''
self.cutoff = cutoff self.cutoff = cutoff
self.fuzzy = fuzzy
self.strings = strings self.strings = strings
self.show_same = same self.show_same = same
self.show_missing = missing self.show_missing = missing
...@@ -45,6 +45,10 @@ class HashMatch(object): ...@@ -45,6 +45,10 @@ class HashMatch(object):
self.matches = matches self.matches = matches
self.name = name self.name = name
self.types = types self.types = types
self.max_results = max_results
self.verbose = verbose
self.total = 0
self.magic = magic.open(0) self.magic = magic.open(0)
self.magic.load() self.magic.load()
...@@ -58,9 +62,13 @@ class HashMatch(object): ...@@ -58,9 +62,13 @@ class HashMatch(object):
self.types[k] = re.compile(self.types[k]) self.types[k] = re.compile(self.types[k])
def _get_strings(self, fname): def _get_strings(self, fname):
return ''.join([string for (offset, string) in binwalk.smartstrings.FileStrings(fname, n=10).strings()]) return ''.join([string for (offset, string) in binwalk.smartstrings.FileStrings(fname, n=10, block=None).strings()])
def files(self, file1, file2): def _print(self, message):
if self.verbose:
print(message)
def _compare_files(self, file1, file2):
''' '''
Fuzzy diff two files. Fuzzy diff two files.
...@@ -73,7 +81,10 @@ class HashMatch(object): ...@@ -73,7 +81,10 @@ class HashMatch(object):
status = 0 status = 0
if not self.name or os.path.basename(file1) == os.path.basename(file2): if not self.name or os.path.basename(file1) == os.path.basename(file2):
if self.fuzzy: if os.path.exists(file1) and os.path.exists(file2):
self._print("Checking %s -> %s" % (file1, file2))
hash1 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT) hash1 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)
hash2 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT) hash2 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)
...@@ -98,11 +109,7 @@ class HashMatch(object): ...@@ -98,11 +109,7 @@ class HashMatch(object):
else: else:
return self.lib.fuzzy_compare(hash1, hash2) return self.lib.fuzzy_compare(hash1, hash2)
except Exception as e: except Exception as e:
print "WARNING: Exception while performing fuzzy comparison:", e print "WARNING: Exception while doing fuzzy hash:", e
elif not self.strings:
if file_md5(file1) == file_md5(file2):
return 100
return None return None
...@@ -111,7 +118,7 @@ class HashMatch(object): ...@@ -111,7 +118,7 @@ class HashMatch(object):
Returns True if the match value is greater than or equal to the cutoff. Returns True if the match value is greater than or equal to the cutoff.
Returns False if the match value is less than the cutoff. Returns False if the match value is less than the cutoff.
''' '''
return (match >= self.cutoff) return (match is not None and match >= self.cutoff)
def _get_file_list(self, directory): def _get_file_list(self, directory):
''' '''
...@@ -141,7 +148,11 @@ class HashMatch(object): ...@@ -141,7 +148,11 @@ class HashMatch(object):
if self.types: if self.types:
for f in files: for f in files:
for (include, type_regex) in iterator(self.types): for (include, type_regex) in iterator(self.types):
magic_result = self.magic.file(f).lower() try:
magic_result = self.magic.file(os.path.join(directory, f)).lower()
except Exception as e:
magic_result = ''
match = type_regex.match(magic_result) match = type_regex.match(magic_result)
# If this matched an include filter, or didn't match an exclude filter # If this matched an include filter, or didn't match an exclude filter
...@@ -162,6 +173,12 @@ class HashMatch(object): ...@@ -162,6 +173,12 @@ class HashMatch(object):
return set(file_list) return set(file_list)
def files(self, file1, file2):
m = self._compare_files(file1, file2)
if m is None:
m = 0
return [(m, file2)]
def file(self, fname, directories): def file(self, fname, directories):
''' '''
Search for a particular file in multiple directories. Search for a particular file in multiple directories.
...@@ -172,45 +189,57 @@ class HashMatch(object): ...@@ -172,45 +189,57 @@ class HashMatch(object):
Returns a list of tuple results. Returns a list of tuple results.
''' '''
matching_files = [] matching_files = []
self.total = 0
for directory in directories: for directory in directories:
for f in self._get_file_list(directory): for f in self._get_file_list(directory):
f = os.path.join(directory, f) f = os.path.join(directory, f)
m = self.files(fname, f) m = self._compare_files(fname, f)
if self.is_match(m): if m is not None and self.is_match(m):
matching_files.append((m, f)) matching_files.append((m, f))
self.total += 1
if self.max_results and self.total >= self.max_results:
return matching_files
return matching_files return matching_files
def directories(self, dir1, dir2): def directories(self, source, dir_list):
''' '''
Search two directories for matching files. Search two directories for matching files.
@dir1 - First directory. @source - Source directory to compare everything to.
@dir2 - Second directory. @dir_list - Compare files in source to files in these directories.
Returns a list of tuple results. Returns a list of tuple results.
''' '''
results = [] results = []
self.total = 0
source_files = self._get_file_list(source)
dir1_files = self._get_file_list(dir1) for directory in dir_list:
dir2_files = self._get_file_list(dir2) dir_files = self._get_file_list(directory)
for f in source_files:
if f in dir_files:
file1 = os.path.join(source, f)
file2 = os.path.join(directory, f)
for f in dir1_files: m = self._compare_files(file1, file2)
if f in dir2_files: if m is not None:
file1 = os.path.join(dir1, f) matches = self.is_match(m)
file2 = os.path.join(dir2, f)
m = self.files(file1, file2) if (matches and self.show_same) or (not matches and not self.show_same):
if m is not None: results.append(("%3d" % m, f))
matches = self.is_match(m)
if (matches and self.show_same) or (not matches and not self.show_same): self.total += 1
results.append(("%3d" % m, f)) if self.max_results and self.total >= self.max_results:
return results
if self.show_missing: if self.show_missing and len(dir_list) == 1:
results += [('---', f) for f in (dir1_files-dir2_files)] results += [('---', f) for f in (source_files-dir_files)]
results += [('+++', f) for f in (dir2_files-dir1_files)] results += [('+++', f) for f in (dir_files-source_files)]
return results return results
...@@ -218,7 +247,7 @@ class HashMatch(object): ...@@ -218,7 +247,7 @@ class HashMatch(object):
if __name__ == '__main__': if __name__ == '__main__':
import sys import sys
hmatch = HashMatch(strings=True, name=True) hmatch = HashMatch(strings=True, name=False, types={True:"^elf"})
print hmatch.file(sys.argv[1], sys.argv[2:]) print hmatch.file(sys.argv[1], sys.argv[2:])
#for (match, fname) in hmatch.directories(sys.argv[1], sys.argv[2]): #for (match, fname) in hmatch.directories(sys.argv[1], sys.argv[2]):
#for (match, fname) in hmatch.find_file(sys.argv[1], sys.argv[2:]): #for (match, fname) in hmatch.find_file(sys.argv[1], sys.argv[2:]):
......
...@@ -41,7 +41,7 @@ class FileStrings(object): ...@@ -41,7 +41,7 @@ class FileStrings(object):
@length - The number of bytes in the file to analyze. @length - The number of bytes in the file to analyze.
@offset - The starting offset into the file to begin analysis. @offset - The starting offset into the file to begin analysis.
@n - The minimum valid string length. @n - The minimum valid string length.
@block - The block size to use when performing entropy analysis. @block - The block size to use when performing entropy analysis. Set to None to skip entropy analysis.
@algorithm - The entropy algorithm to use when performing entropy analysis. @algorithm - The entropy algorithm to use when performing entropy analysis.
@plugins - An instance of the Plugins class. @plugins - An instance of the Plugins class.
...@@ -59,22 +59,31 @@ class FileStrings(object): ...@@ -59,22 +59,31 @@ class FileStrings(object):
self.valid_strings = [] self.valid_strings = []
self.external_validators = [] self.external_validators = []
self.plugins = plugins self.plugins = plugins
self.block = block
if not self.n: if not self.n:
self.n = self.MIN_STRING_LENGTH self.n = self.MIN_STRING_LENGTH
# Perform an entropy analysis over the entire file (anything less may generate poor entropy data). if self.block is not None:
# Give fake file results list to prevent FileEntropy from doing too much analysis. # Perform an entropy analysis over the entire file (anything less may generate poor entropy data).
with entropy.FileEntropy(file_name, block=block, file_results=['foo']) as e: # Give fake file results list to prevent FileEntropy from doing too much analysis.
(self.x, self.y, self.average_entropy) = e.analyze(algorithm=algorithm) with entropy.FileEntropy(file_name, block=self.block, file_results=['foo']) as e:
for i in range(0, len(self.x)): (self.x, self.y, self.average_entropy) = e.analyze(algorithm=algorithm)
self.entropy[self.x[i]] = self.y[i] for i in range(0, len(self.x)):
# Make sure our block size matches the entropy analysis's block size self.entropy[self.x[i]] = self.y[i]
self.block = e.block # Make sure our block size matches the entropy analysis's block size
self.block = e.block
# Make sure the starting offset is a multiple of the block size; else, when later checking # Make sure the starting offset is a multiple of the block size; else, when later checking
# the entropy analysis, block offsets won't line up. # the entropy analysis, block offsets won't line up.
self.start -= (self.start % self.block) self.start -= (self.start % self.block)
else:
i = 0
self.block = common.BlockFile.READ_BLOCK_SIZE
# Fake the entropy scan
while i < common.file_size(file_name):
self.entropy[i] = 1.0
i += self.block
self.fd = common.BlockFile(file_name, 'r', length=length, offset=self.start) self.fd = common.BlockFile(file_name, 'r', length=length, offset=self.start)
# TODO: This is not optimal. We should read in larger chunks and process it into self.block chunks. # TODO: This is not optimal. We should read in larger chunks and process it into self.block chunks.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment