Commit 2ecfc772 by devttys0

Added rehash. hashmatch.py now in working condition.

parent 3992d1cd
...@@ -3,13 +3,53 @@ ...@@ -3,13 +3,53 @@
import os import os
import re import re
import sys import sys
import magic
import binwalk.hashmatch as hashmatch import binwalk.hashmatch as hashmatch
from binwalk.compat import * from binwalk.compat import *
from getopt import GetoptError, gnu_getopt as GetOpt from getopt import GetoptError, gnu_getopt as GetOpt
def usage(fd):
    '''
    Write the command line help text to fd, then terminate the process.

    @fd - A writable file-like object (normally sys.stdout or sys.stderr).

    Never returns: exits with status 0 if fd is sys.stdout (help was explicitly
    requested), or with status 1 for any other stream (invalid usage).
    '''
    help_text = (
        "\n",
        'Diff files or directories using Context Triggered Piecewise Hashing ("fuzzy" hashing).\n',
        "Craig Heffner, http://www.devttys0.com\n",
        "\n",
        "Usage: %s [OPTIONS] [NEEDLE] [HAYSTACK] [HAYSTACK] [HAYSTACK] ...\n" % os.path.basename(sys.argv[0]),
        "\n",
        "NEEDLE may be a file or a directory.\n",
        "HAYSTACKs must be either all files or all directories.\n",
        "\n",
        "Diffing Options:\n",
        "\t-d, --diff Show files that are different (default)\n",
        "\t-s, --same Show files that are the same\n",
        "\t-S, --strings Diff strings inside files instead of the entire file\n",
        # The option parser binds the cutoff to -o; -c is the csv switch.
        "\t-o, --cutoff=<n> Set the cutoff percentage (default: 50%)\n",
        "\t-m, --max=<n> Quit after n number of matches\n",
        "\n",
        "Filtering Options:\n",
        "\t-n, --name Only diff files whose base names are the same\n",
        "\t-l, --symlinks Don't ignore symlinks\n",
        "\t-y, --include-file=<match> Only diff against a specific file name (e.g., *.py, *.bin, etc)\n",
        "\t-x, --exclude-file=<match> Do not diff against a specific file name (e.g., *.py, *.bin, etc)\n",
        "\t-Y, --include-type=<type> Only diff against a certain file type (e.g., elf, jpeg, etc)\n",
        "\t-X, --exclude-type=<type> Do not diff against a certain file type (e.g., elf, jpeg, etc)\n",
        "\n",
        "General Options:\n",
        "\t-f, --file=<file> Log results to file\n",
        "\t-c, --csv Log results to file in csv format\n",
        "\t-q, --quiet Suppress output to stdout\n",
        "\t-t, --term Format output to fit the terminal window\n",
        "\t-h, --help Show help\n",
        "\n",
    )

    fd.write("".join(help_text))

    # Help printed to stdout is a successful run; anything else signals a usage error.
    if fd == sys.stdout:
        sys.exit(0)
    else:
        sys.exit(1)
def main(): def main():
...@@ -17,31 +57,39 @@ def main(): ...@@ -17,31 +57,39 @@ def main():
options = [] options = []
arguments = [] arguments = []
file_list = [] file_list = []
include_files = []
exclude_files = []
include_types = []
exclude_types = []
types = {} types = {}
matches = {}
log_file = None
log_csv = False
fit_to_width = False
quiet = False
strings = False strings = False
symlinks = False symlinks = False
all_types = False
name = False name = False
same = False same = False
missing = False
cutoff = None cutoff = None
max_results = None max_results = None
verbose = False
short_options = "c:hlmnSsvx:" short_options = "cdf:hlm:no:qSstx:X:y:Y:"
long_options = [ long_options = [
"help", "help",
"cutoff=", "cutoff=",
"strings", "strings",
"show-same", "same",
"show-missing", "diff",
"max=", "max=",
"symlinks", "symlinks",
"name", "name",
"file-type", "file=",
"file-name", "csv",
"verbose", "term",
"quiet",
] ]
try: try:
...@@ -59,16 +107,30 @@ def main(): ...@@ -59,16 +107,30 @@ def main():
symlinks = True symlinks = True
elif opt in ("-n", "--name"): elif opt in ("-n", "--name"):
name = True name = True
elif opt in ("-s", "--show-same"): elif opt in ("-s", "--same"):
same = True same = True
elif opt in ("-m", "--show-missing"): elif opt in ("-d", "--diff"):
missing = True same = False
elif opt in ("-x", "--max"): elif opt in ("-t", "--term"):
fit_to_width = True
elif opt in ("-c", "--csv"):
log_csv = True
elif opt in ("-q", "--quiet"):
quiet = True
elif opt in ("-f", "--file"):
log_file = arg
elif opt in ("-m", "--max"):
max_results = int(arg, 0) max_results = int(arg, 0)
elif opt in ("-c", "--cutoff"): elif opt in ("-o", "--cutoff"):
cutoff = int(arg, 0) cutoff = int(arg, 0)
elif opt in ("-v", "--verbose"): elif opt in ("-y", "--include-file"):
verbose = True include_files.append(arg)
elif opt in ("-x", "--exclude-file"):
exclude_files.append(arg)
elif opt in ("-Y", "--include-type"):
include_types.append(arg.lower())
elif opt in ("-X", "--exclude-types"):
exclude_types.append(arg.lower())
# Keep track of the options and arguments. # Keep track of the options and arguments.
# This is used later to determine which argv entries are file names. # This is used later to determine which argv entries are file names.
...@@ -82,38 +144,39 @@ def main(): ...@@ -82,38 +144,39 @@ def main():
if opt not in arguments and opt not in options and not opt.startswith('-'): if opt not in arguments and opt not in options and not opt.startswith('-'):
file_list.append(opt) file_list.append(opt)
if include_files:
matches[True] = include_files
if exclude_files:
matches[False] = exclude_files
if include_types:
types[True] = include_types
if exclude_types:
types[False] = exclude_types
if len(file_list) >= 2: if len(file_list) >= 2:
rehash = hashmatch.HashMatch(cutoff=cutoff, rehash = hashmatch.HashMatch(cutoff=cutoff,
strings=strings, strings=strings,
same=same,
symlinks=symlinks, symlinks=symlinks,
name=name, name=name,
same=same,
missing=missing,
max_results=max_results, max_results=max_results,
verbose=verbose) display=True,
quiet=quiet,
log=log_file,
csv=log_csv,
format_to_screen=fit_to_width,
types=types,
matches=matches)
if os.path.isfile(file_list[0]): if os.path.isfile(file_list[0]):
if not all_types and len(types) == 0:
m = magic.open(0)
m.load()
file_type = m.file(file_list[0])
if file_type:
types[True] = re.escape(file_type.lower())
if os.path.isfile(file_list[1]): if os.path.isfile(file_list[1]):
results = rehash.files(file_list[0], file_list[1]) rehash.files(file_list[0], file_list[1:])
else: else:
results = rehash.file(file_list[0], file_list[1:]) rehash.file(file_list[0], file_list[1:])
else: else:
for f in file_list: rehash.directories(file_list[0], file_list[1:])
if not os.path.isdir(f):
print("Invalid usage")
usage(sys.stderr)
results = rehash.directories(file_list[0], file_list[1])
for (match, fname) in results:
print("%s %s" % (match, fname))
if __name__ == "__main__": if __name__ == "__main__":
main() main()
......
...@@ -110,6 +110,32 @@ def unique_file_name(base_name, extension=''): ...@@ -110,6 +110,32 @@ def unique_file_name(base_name, extension=''):
return fname return fname
def strings(filename, minimum=4):
    '''
    A strings generator, similar to the Unix strings utility.

    @filename - The file to search for strings in.
    @minimum  - The minimum string length to search for.

    Yields printable ASCII strings from filename. A run of printable
    characters may span more than one block returned by read_block().
    '''
    chars = []

    with BlockFile(filename) as f:
        while True:
            # dlen is returned by read_block() but is not needed here.
            (data, dlen) = f.read_block()
            if not data:
                break

            for c in data:
                if c in string.printable:
                    chars.append(c)
                    continue

                # A non-printable character ends the current run; only
                # report the run if it is long enough.
                if len(chars) >= minimum:
                    yield ''.join(chars)
                chars = []

    # Flush a run that extends to the end of the file; without this the
    # final string of a file is silently dropped.
    if len(chars) >= minimum:
        yield ''.join(chars)
class MathExpression(object): class MathExpression(object):
''' '''
......
...@@ -6,7 +6,15 @@ import ctypes ...@@ -6,7 +6,15 @@ import ctypes
import ctypes.util import ctypes.util
import binwalk.smartstrings import binwalk.smartstrings
from binwalk.compat import * from binwalk.compat import *
from binwalk.common import file_md5 from binwalk.common import strings
from binwalk.prettyprint import PrettyPrint
class HashResult(object):
    '''
    Simple container for the data computed for one file, so that it can be
    reused if the same file is compared again.
    '''

    def __init__(self, name, hash=None, strings=None):
        '''
        Class constructor.

        @name    - The file name this result belongs to (may be None for an empty result).
        @hash    - The hash computed for the file, if any.
        @strings - The strings extracted from the file, if any.

        Returns None.
        '''
        self.name = name
        self.hash = hash
        self.strings = strings

    def __repr__(self):
        # The strings payload can be very large, so only report whether it is set.
        return "%s(name=%r, hash=%r, has_strings=%r)" % (type(self).__name__,
                                                        self.name,
                                                        self.hash,
                                                        self.strings is not None)
class HashMatch(object): class HashMatch(object):
...@@ -20,35 +28,40 @@ class HashMatch(object): ...@@ -20,35 +28,40 @@ class HashMatch(object):
FUZZY_DEFAULT_CUTOFF = 50 FUZZY_DEFAULT_CUTOFF = 50
def __init__(self, cutoff=None, strings=False, same=False, missing=False, symlinks=False, name=False, max_results=None, matches={}, types={}, verbose=False): def __init__(self, cutoff=None, strings=False, same=False, symlinks=False, name=False, max_results=None, display=False, log=None, csv=False, quiet=False, format_to_screen=False, matches={}, types={}):
''' '''
Class constructor. Class constructor.
@cutoff - The fuzzy cutoff which determines if files are different or not. @cutoff - The fuzzy cutoff which determines if files are different or not.
@strings - Only hash strings inside of the file, not the entire file itself. @strings - Only hash strings inside of the file, not the entire file itself.
@same - Set to True to show files that are the same, False to show files that are different. @same - Set to True to show files that are the same, False to show files that are different.
@missing - Set to True to show missing files.
@symlinks - Set to True to include symbolic link files. @symlinks - Set to True to include symbolic link files.
@name - Set to True to only compare files whose base names match. @name - Set to True to only compare files whose base names match.
@max_results - Stop searching after x number of matches. @max_results - Stop searching after x number of matches.
@display - Set to True to display results to stdout.
@matches - A dictionary of file names to diff. @matches - A dictionary of file names to diff.
@types - A dictionary of file types to diff. @types - A dictionary of file types to diff.
@verbose - Enable verbose mode.
Returns None. Returns None.
''' '''
self.cutoff = cutoff self.cutoff = cutoff
self.strings = strings self.strings = strings
self.show_same = same self.show_same = same
self.show_missing = missing
self.symlinks = symlinks self.symlinks = symlinks
self.matches = matches self.matches = matches
self.name = name self.name = name
self.types = types self.types = types
self.max_results = max_results self.max_results = max_results
self.verbose = verbose
if display:
self.pretty_print = PrettyPrint(log=log, csv=csv, format_to_screen=format_to_screen, quiet=quiet)
self.pretty_print.header(header="PERCENTAGE\tFILE NAME")
else:
self.pretty_print = None
self.total = 0 self.total = 0
self.last_file1 = HashResult(None)
self.last_file2 = HashResult(None)
self.magic = magic.open(0) self.magic = magic.open(0)
self.magic.load() self.magic.load()
...@@ -59,14 +72,19 @@ class HashMatch(object): ...@@ -59,14 +72,19 @@ class HashMatch(object):
self.cutoff = self.FUZZY_DEFAULT_CUTOFF self.cutoff = self.FUZZY_DEFAULT_CUTOFF
for k in get_keys(self.types): for k in get_keys(self.types):
self.types[k] = re.compile(self.types[k]) for i in range(0, len(self.types[k])):
self.types[k][i] = re.compile(self.types[k][i])
def _get_strings(self, fname): def _get_strings(self, fname):
return ''.join([string for (offset, string) in binwalk.smartstrings.FileStrings(fname, n=10, block=None).strings()]) return ''.join(list(binwalk.common.strings(fname, minimum=10)))
def _print(self, message): def _print(self, match, fname):
if self.verbose: if self.pretty_print:
print(message) self.pretty_print.results(None, [{'description' : '%4d\t\t%s\n' % (match, fname)}], formatted=True)
def _print_footer(self):
if self.pretty_print:
self.pretty_print.footer()
def _compare_files(self, file1, file2): def _compare_files(self, file1, file2):
''' '''
...@@ -79,31 +97,67 @@ class HashMatch(object): ...@@ -79,31 +97,67 @@ class HashMatch(object):
Returns None on error. Returns None on error.
''' '''
status = 0 status = 0
file1_dup = False
file2_dup = False
if not self.name or os.path.basename(file1) == os.path.basename(file2): if not self.name or os.path.basename(file1) == os.path.basename(file2):
if os.path.exists(file1) and os.path.exists(file2): if os.path.exists(file1) and os.path.exists(file2):
self._print("Checking %s -> %s" % (file1, file2))
hash1 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT) hash1 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)
hash2 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT) hash2 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)
if file1 == self.last_file1.name and self.last_file1.hash:
file1_dup = True
else:
self.last_file1.name = file1
if file2 == self.last_file2.name and self.last_file2.hash:
file2_dup = True
else:
self.last_file2.name = file2
try: try:
if self.strings: if self.strings:
file1_strings = self._get_strings(file1) if file1_dup:
file2_strings = self._get_strings(file2) file1_strings = self.last_file1.strings
else:
self.last_file1.strings = file1_strings = self._get_strings(file1)
if file2_dup:
file2_strings = self.last_file2.strings
else:
self.last_file2.strings = file2_strings = self._get_strings(file2)
if file1_strings == file2_strings: if file1_strings == file2_strings:
return 100 return 100
else: else:
if file1_dup:
hash1 = self.last_file1.hash
else:
status |= self.lib.fuzzy_hash_buf(str2bytes(file1_strings), len(file1_strings), hash1) status |= self.lib.fuzzy_hash_buf(str2bytes(file1_strings), len(file1_strings), hash1)
if file2_dup:
hash2 = self.last_file2.hash
else:
status |= self.lib.fuzzy_hash_buf(str2bytes(file2_strings), len(file2_strings), hash2) status |= self.lib.fuzzy_hash_buf(str2bytes(file2_strings), len(file2_strings), hash2)
else: else:
if file1_dup:
hash1 = self.last_file1.hash
else:
status |= self.lib.fuzzy_hash_filename(str2bytes(file1), hash1) status |= self.lib.fuzzy_hash_filename(str2bytes(file1), hash1)
if file2_dup:
hash2 = self.last_file2.hash
else:
status |= self.lib.fuzzy_hash_filename(str2bytes(file2), hash2) status |= self.lib.fuzzy_hash_filename(str2bytes(file2), hash2)
if status == 0: if status == 0:
if not file1_dup:
self.last_file1.hash = hash1
if not file2_dup:
self.last_file2.hash = hash2
if hash1.raw == hash2.raw: if hash1.raw == hash2.raw:
return 100 return 100
else: else:
...@@ -115,10 +169,10 @@ class HashMatch(object): ...@@ -115,10 +169,10 @@ class HashMatch(object):
def is_match(self, match): def is_match(self, match):
''' '''
Returns True if the match value is greater than or equal to the cutoff. Returns True if this is a good match.
Returns False if the match value is less than the cutoff. Returns False if his is not a good match.
''' '''
return (match is not None and match >= self.cutoff) return (match is not None and ((match >= self.cutoff and self.show_same) or (match < self.cutoff and not self.show_same)))
def _get_file_list(self, directory): def _get_file_list(self, directory):
''' '''
...@@ -147,13 +201,14 @@ class HashMatch(object): ...@@ -147,13 +201,14 @@ class HashMatch(object):
# Filter based on the file type, as reported by libmagic # Filter based on the file type, as reported by libmagic
if self.types: if self.types:
for f in files: for f in files:
for (include, type_regex) in iterator(self.types): for (include, regex_list) in iterator(self.types):
for regex in regex_list:
try: try:
magic_result = self.magic.file(os.path.join(directory, f)).lower() magic_result = self.magic.file(os.path.join(directory, f)).lower()
except Exception as e: except Exception as e:
magic_result = '' magic_result = ''
match = type_regex.match(magic_result) match = regex.match(magic_result)
# If this matched an include filter, or didn't match an exclude filter # If this matched an include filter, or didn't match an exclude filter
if (match and include) or (not match and not include): if (match and include) or (not match and not include):
...@@ -161,7 +216,8 @@ class HashMatch(object): ...@@ -161,7 +216,8 @@ class HashMatch(object):
# Filter based on file name # Filter based on file name
if self.matches: if self.matches:
for (include, file_filter) in iterator(self.matches): for (include, file_filter_list) in iterator(self.matches):
for file_filter in file_filter_list:
matching_files = fnmatch.filter(files, file_filter) matching_files = fnmatch.filter(files, file_filter)
# If this is an include filter, add all matching files to the list # If this is an include filter, add all matching files to the list
...@@ -173,74 +229,98 @@ class HashMatch(object): ...@@ -173,74 +229,98 @@ class HashMatch(object):
return set(file_list) return set(file_list)
def files(self, file1, file2): def files(self, needle, haystack):
m = self._compare_files(file1, file2) '''
if m is None: Compare one file against a list of other files.
m = 0
return [(m, file2)]
def file(self, fname, directories): @needle - File to match against.
@haystack - A list of haystack files.
Returns a list of tuple results.
''' '''
Search for a particular file in multiple directories. results = []
self.total = 0
for f in haystack:
m = self._compare_files(needle, f)
if m is not None and self.is_match(m):
self._print(m, f)
results.append((m, f))
@fname - File to search for. self.total += 1
@directories - List of directories to search in. if self.max_results and self.total >= self.max_results:
break
self._print_footer()
return results
def file(self, needle, haystack):
'''
Search for one file inside one or more directories.
@needle - File to search for.
@haystack - List of directories to search in.
Returns a list of tuple results. Returns a list of tuple results.
''' '''
matching_files = [] matching_files = []
self.total = 0 self.total = 0
done = False
for directory in directories: for directory in haystack:
for f in self._get_file_list(directory): for f in self._get_file_list(directory):
f = os.path.join(directory, f) f = os.path.join(directory, f)
m = self._compare_files(fname, f) m = self._compare_files(needle, f)
if m is not None and self.is_match(m): if m is not None and self.is_match(m):
self._print(m, f)
matching_files.append((m, f)) matching_files.append((m, f))
self.total += 1 self.total += 1
if self.max_results and self.total >= self.max_results: if self.max_results and self.total >= self.max_results:
return matching_files done = True
break
if done:
break
self._print_footer()
return matching_files return matching_files
def directories(self, source, dir_list): def directories(self, needle, haystack):
''' '''
Search two directories for matching files. Compare the contents of one directory with the contents of other directories.
@source - Source directory to compare everything to. @source - Source directory to compare everything to.
@dir_list - Compare files in source to files in these directories. @dir_list - Compare files in source to files in these directories.
Returns a list of tuple results. Returns a list of tuple results.
''' '''
done = False
results = [] results = []
self.total = 0 self.total = 0
source_files = self._get_file_list(source) source_files = self._get_file_list(needle)
for directory in dir_list: for directory in haystack:
dir_files = self._get_file_list(directory) dir_files = self._get_file_list(directory)
for f in source_files: for f in source_files:
if f in dir_files: if f in dir_files:
file1 = os.path.join(source, f) file1 = os.path.join(needle, f)
file2 = os.path.join(directory, f) file2 = os.path.join(directory, f)
m = self._compare_files(file1, file2) m = self._compare_files(file1, file2)
if m is not None: if m is not None and self.is_match(m):
matches = self.is_match(m) self._print(m, f)
results.append((m, f))
if (matches and self.show_same) or (not matches and not self.show_same):
results.append(("%3d" % m, f))
self.total += 1 self.total += 1
if self.max_results and self.total >= self.max_results: if self.max_results and self.total >= self.max_results:
return results done = True
break
if self.show_missing and len(dir_list) == 1: if done:
results += [('---', f) for f in (source_files-dir_files)] break
results += [('+++', f) for f in (dir_files-source_files)]
self._print_footer()
return results return results
......
...@@ -37,7 +37,7 @@ class PrettyPrint: ...@@ -37,7 +37,7 @@ class PrettyPrint:
MAX_LINE_LEN = 0 MAX_LINE_LEN = 0
DEFAULT_DESCRIPTION_HEADER = "DESCRIPTION" DEFAULT_DESCRIPTION_HEADER = "DESCRIPTION"
def __init__(self, binwalk, log=None, csv=False, quiet=False, verbose=0, format_to_screen=False): def __init__(self, binwalk=None, log=None, csv=False, quiet=False, verbose=0, format_to_screen=False):
''' '''
Class constructor. Class constructor.
...@@ -109,7 +109,7 @@ class PrettyPrint: ...@@ -109,7 +109,7 @@ class PrettyPrint:
data_parts = data.split(None, 2) data_parts = data.split(None, 2)
if len(data_parts) == 3: if len(data_parts) in [2,3]:
for i in range(0, len(data_parts)): for i in range(0, len(data_parts)):
data_parts[i] = data_parts[i].strip() data_parts[i] = data_parts[i].strip()
...@@ -223,6 +223,7 @@ class PrettyPrint: ...@@ -223,6 +223,7 @@ class PrettyPrint:
self._pprint("\n") self._pprint("\n")
self._pprint("Scan Time: %s\n" % timestamp, nolog=nolog) self._pprint("Scan Time: %s\n" % timestamp, nolog=nolog)
if self.binwalk:
self._pprint("Signatures: %d\n" % self.binwalk.parser.signature_count, nolog=nolog) self._pprint("Signatures: %d\n" % self.binwalk.parser.signature_count, nolog=nolog)
self._pprint("Target File: %s\n" % file_name, nolog=nolog) self._pprint("Target File: %s\n" % file_name, nolog=nolog)
self._pprint("MD5 Checksum: %s\n" % md5sum, nolog=nolog) self._pprint("MD5 Checksum: %s\n" % md5sum, nolog=nolog)
...@@ -276,7 +277,7 @@ class PrettyPrint: ...@@ -276,7 +277,7 @@ class PrettyPrint:
for info in results: for info in results:
# Check for any grep filters before printing # Check for any grep filters before printing
if self.binwalk.filter.grep(info['description']): if not self.binwalk or self.binwalk.filter.grep(info['description']):
if not formatted: if not formatted:
# Only display the offset once per list of results # Only display the offset once per list of results
if not offset_printed: if not offset_printed:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment