Added rehash. hashmatch.py now in working condition.

2ecfc772 · devttys0 · 3992d1cd · 2ecfc772 · 2ecfc772 · 2ecfc772
Commit 2ecfc772 authored Dec 04, 2013 by devttys0
Show whitespace changes
Inline Side-by-side

Showing with 259 additions and 89 deletions

rehash src/bin/rehash +101 -38

common.py src/binwalk/common.py +26 -0

hashmatch.py src/binwalk/hashmatch.py +128 -48

prettyprint.py src/binwalk/prettyprint.py +4 -3

No files found.
--- a/src/bin/rehash
+++ b/src/bin/rehash
@@ -3,13 +3,53 @@
 import os
 import re
 import sys
-import magic
 import binwalk.hashmatch as hashmatch
 from binwalk.compat import *
 from getopt import GetoptError, gnu_getopt as GetOpt
 def usage(fd):
-	fd.write("Usage: %s [OPTIONS] [FILE | DIR] [FILE | DIR] ...\n" % sys.argv[0])
+	fd.write("\n")
+	fd.write('Diff files or directories using Context Triggered Piecewise Hashing ("fuzzy" hashing).\n')
+	fd.write("Craig Heffner, http://www.devttys0.com\n")
+	fd.write("\n")
+	fd.write("Usage: %s [OPTIONS] [NEEDLE] [HAYSTACK] [HAYSTACK] [HAYSTACK] ...\n" % os.path.basename(sys.argv[0]))
+	fd.write("\n")
+	fd.write("NEEDLE may be a file or a directory.\n")
+	fd.write("HAYSTACKs must be either all files or all directories.\n")
+	fd.write("\n")
+	fd.write("Diffing Options:\n")
+	fd.write("\t-d, --diff                    Show files that are different (default)\n")
+	fd.write("\t-s, --same                    Show files that are the same\n")
+	fd.write("\t-S, --strings                 Diff strings inside files instead of the entire file\n")
+	fd.write("\t-o, --cutoff=<n>              Set the cutoff percentage (default: 50%)\n")
+	fd.write("\t-m, --max=<n>                 Quit after n number of matches\n")
+	fd.write("\n")
+	fd.write("Filtering Options:\n")
+	fd.write("\t-n, --name                    Only diff files whose base names are the same\n")
+	fd.write("\t-l, --symlinks                Don't ignore symlinks\n")
+	fd.write("\t-y, --include-file=<match>    Only diff against a specific file name (e.g., *.py, *.bin, etc)\n")
+	fd.write("\t-x, --exclude-file=<match>    Do not diff against a specific file name (e.g., *.py, *.bin, etc)\n")
+	fd.write("\t-Y, --include-type=<type>     Only diff against a certain file type (e.g., elf, jpeg, etc)\n")
+	fd.write("\t-X, --exclude-type=<type>     Do not diff against a certain file type (e.g., elf, jpeg, etc)\n")
+	fd.write("\n")
+	fd.write("General Options:\n")
+	fd.write("\t-f, --file=<file>             Log results to file\n")
+	fd.write("\t-c, --csv                     Log results to file in csv format\n")
+	fd.write("\t-q, --quiet                   Suppress output to stdout\n")
+	fd.write("\t-t, --term                    Format output to fit the terminal window\n")
+	fd.write("\t-h, --help                    Show help\n")
+	fd.write("\n")
+	if fd == sys.stdout:
+		sys.exit(0)
+	else:
+		sys.exit(1)
 def main():
@@ -17,31 +57,39 @@ def main():
 	options = []
 	arguments = []
 	file_list = []
+	include_files = []
+	exclude_files = []
+	include_types = []
+	exclude_types = []
 	types = {}
+	matches = {}
+	log_file = None
+	log_csv = False
+	fit_to_width = False
+	quiet = False	
 	strings = False
 	symlinks = False
-	all_types = False
 	name = False
 	same = False
-	missing = False
 	cutoff = None
 	max_results = None
-	verbose = False
-	short_options = "c:hlmnSsvx:"
+	short_options = "cdf:hlm:no:qSstx:X:y:Y:"
 	long_options = [
 			"help",
 			"cutoff=",
 			"strings",
-			"show-same",
+			"same",
-			"show-missing",
+			"diff",
 			"max=",
 			"symlinks",
 			"name",
-			"file-type",
+			"file=",
-			"file-name",
+			"csv",
-			"verbose",
+			"term",
+			"quiet",
 	]
 	try:
@@ -59,16 +107,30 @@ def main():
 			symlinks = True
 		elif opt in ("-n", "--name"):
 			name = True
-		elif opt in ("-s", "--show-same"):
+		elif opt in ("-s", "--same"):
 			same = True
-		elif opt in ("-m", "--show-missing"):
+		elif opt in ("-d", "--diff"):
-			missing = True
+			same = False
-		elif opt in ("-x", "--max"):
+		elif opt in ("-t", "--term"):
+			fit_to_width = True
+		elif opt in ("-c", "--csv"):
+			log_csv = True
+		elif opt in ("-q", "--quiet"):
+			quiet = True
+		elif opt in ("-f", "--file"):
+			log_file = arg
+		elif opt in ("-m", "--max"):
 			max_results = int(arg, 0)
-		elif opt in ("-c", "--cutoff"):
+		elif opt in ("-o", "--cutoff"):
 			cutoff = int(arg, 0)
-		elif opt in ("-v", "--verbose"):
+		elif opt in ("-y", "--include-file"):
-			verbose = True
+			include_files.append(arg)
+		elif opt in ("-x", "--exclude-file"):
+			exclude_files.append(arg)
+		elif opt in ("-Y", "--include-type"):
+			include_types.append(arg.lower())
+		elif opt in ("-X", "--exclude-types"):
+			exclude_types.append(arg.lower())
 		# Keep track of the options and arguments.
 		# This is used later to determine which argv entries are file names.
@@ -82,38 +144,39 @@ def main():
 		if opt not in arguments and opt not in options and not opt.startswith('-'):
 			file_list.append(opt)
+	if include_files:
+		matches[True] = include_files
+	if exclude_files:
+		matches[False] = exclude_files
+	if include_types:
+		types[True] = include_types
+	if exclude_types:
+		types[False] = exclude_types
 	if len(file_list) >= 2:
 		rehash = hashmatch.HashMatch(cutoff=cutoff,
 						strings=strings,
+						same=same,
 						symlinks=symlinks, 
 						name=name, 
-						same=same,
-						missing=missing,
 						max_results=max_results,
-						verbose=verbose)
+						display=True,
+						quiet=quiet,
+						log=log_file,
+						csv=log_csv,
+						format_to_screen=fit_to_width,
+						types=types,
+						matches=matches)
 		if os.path.isfile(file_list[0]):
-			if not all_types and len(types) == 0:
-				m = magic.open(0)
-				m.load()
-				file_type = m.file(file_list[0])
-				if file_type:
-					types[True] = re.escape(file_type.lower())
 			if os.path.isfile(file_list[1]):
-				results = rehash.files(file_list[0], file_list[1])
+				rehash.files(file_list[0], file_list[1:])
 			else:
-				results = rehash.file(file_list[0], file_list[1:])
+				rehash.file(file_list[0], file_list[1:])
 		else:
-			for f in file_list:
+			rehash.directories(file_list[0], file_list[1:])
-				if not os.path.isdir(f):
-					print("Invalid usage")
-					usage(sys.stderr)
-			results = rehash.directories(file_list[0], file_list[1])
-	for (match, fname) in results:
-		print("%s  %s" % (match, fname))
 if __name__ == "__main__":
 	main()

--- a/src/binwalk/common.py
+++ b/src/binwalk/common.py
@@ -110,6 +110,32 @@ def unique_file_name(base_name, extension=''):
 	return fname
+def strings(filename, minimum=4):
+	'''
+	A strings generator, similar to the Unix strings utility.
+	@filename - The file to search for strings in.
+	@minimum  - The minimum string length to search for.
+	Yields printable ASCII strings from filename.
+	'''
+	result = ""
+	with BlockFile(filename) as f:
+		while True:
+			(data, dlen) = f.read_block()
+			if not data:
+				break
+			for c in data:
+				if c in string.printable:
+					result += c
+					continue
+				elif len(result) >= minimum:
+					yield result
+					result = ""
+				else:
+					result = ""
 class MathExpression(object):
 	'''

--- a/src/binwalk/hashmatch.py
+++ b/src/binwalk/hashmatch.py
@@ -6,7 +6,15 @@ import ctypes
 import ctypes.util
 import binwalk.smartstrings
 from binwalk.compat import *
-from binwalk.common import file_md5
+from binwalk.common import strings
+from binwalk.prettyprint import PrettyPrint
+class HashResult(object):
+	def __init__(self, name, hash=None, strings=None):
+		self.name = name
+		self.hash = hash
+		self.strings = strings
 class HashMatch(object):
@@ -20,35 +28,40 @@ class HashMatch(object):
 	FUZZY_DEFAULT_CUTOFF = 50
-	def __init__(self, cutoff=None, strings=False, same=False, missing=False, symlinks=False, name=False, max_results=None, matches={}, types={}, verbose=False):
+	def __init__(self, cutoff=None, strings=False, same=False, symlinks=False, name=False, max_results=None, display=False, log=None, csv=False, quiet=False, format_to_screen=False, matches={}, types={}):
 		'''
 		Class constructor.
 		@cutoff          - The fuzzy cutoff which determines if files are different or not.
 		@strings         - Only hash strings inside of the file, not the entire file itself.
 		@same            - Set to True to show files that are the same, False to show files that are different.
-		@missing         - Set to True to show missing files.
 		@symlinks        - Set to True to include symbolic link files.
 		@name            - Set to True to only compare files whose base names match.
 		@max_results     - Stop searching after x number of matches.
+		@display         - Set to True to display results to stdout.
 		@matches         - A dictionary of file names to diff.
 		@types           - A dictionary of file types to diff.
-		@verbose         - Enable verbose mode.
 		Returns None.
 		'''
 		self.cutoff = cutoff
 		self.strings = strings
 		self.show_same = same
-		self.show_missing = missing
 		self.symlinks = symlinks
 		self.matches = matches
 		self.name = name
 		self.types = types
 		self.max_results = max_results
-		self.verbose = verbose
+		if display:
+			self.pretty_print = PrettyPrint(log=log, csv=csv, format_to_screen=format_to_screen, quiet=quiet)
+			self.pretty_print.header(header="PERCENTAGE\tFILE NAME")
+		else:
+			self.pretty_print = None
 		self.total = 0
+		self.last_file1 = HashResult(None)
+		self.last_file2 = HashResult(None)
 		self.magic = magic.open(0)
 		self.magic.load()
@@ -59,14 +72,19 @@ class HashMatch(object):
 			self.cutoff = self.FUZZY_DEFAULT_CUTOFF
 		for k in get_keys(self.types):
-			self.types[k] = re.compile(self.types[k])
+			for i in range(0, len(self.types[k])):
+				self.types[k][i] = re.compile(self.types[k][i])
 	def _get_strings(self, fname):
-		return ''.join([string for (offset, string) in binwalk.smartstrings.FileStrings(fname, n=10, block=None).strings()])
+		return ''.join(list(binwalk.common.strings(fname, minimum=10)))
-	def _print(self, message):
+	def _print(self, match, fname):
-		if self.verbose:
+		if self.pretty_print:
-			print(message)
+			self.pretty_print.results(None, [{'description' : '%4d\t\t%s\n' % (match, fname)}], formatted=True)
+	def _print_footer(self):
+		if self.pretty_print:
+			self.pretty_print.footer()
 	def _compare_files(self, file1, file2):
 		'''
@@ -79,31 +97,67 @@ class HashMatch(object):
 		Returns None on error.
 		'''
 		status = 0
+		file1_dup = False
+		file2_dup = False
 		if not self.name or os.path.basename(file1) == os.path.basename(file2):
 			if os.path.exists(file1) and os.path.exists(file2):
-				self._print("Checking %s -> %s" % (file1, file2))
 				hash1 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)
 				hash2 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)
+				if file1 == self.last_file1.name and self.last_file1.hash:
+					file1_dup = True
+				else:
+					self.last_file1.name = file1
+				if file2 == self.last_file2.name and self.last_file2.hash:
+					file2_dup = True
+				else:
+					self.last_file2.name = file2
 				try:
 					if self.strings:
-						file1_strings = self._get_strings(file1)
+						if file1_dup:
-						file2_strings = self._get_strings(file2)
+							file1_strings = self.last_file1.strings
+						else:
+							self.last_file1.strings = file1_strings = self._get_strings(file1)
+						if file2_dup:
+							file2_strings = self.last_file2.strings
+						else:
+							self.last_file2.strings = file2_strings = self._get_strings(file2)
 						if file1_strings == file2_strings:
 							return 100
 						else:
+							if file1_dup:
+								hash1 = self.last_file1.hash
+							else:
 								status |= self.lib.fuzzy_hash_buf(str2bytes(file1_strings), len(file1_strings), hash1)
+							if file2_dup:
+								hash2 = self.last_file2.hash
+							else:
 								status |= self.lib.fuzzy_hash_buf(str2bytes(file2_strings), len(file2_strings), hash2)
 					else:
+						if file1_dup:
+							hash1 = self.last_file1.hash
+						else:
 							status |= self.lib.fuzzy_hash_filename(str2bytes(file1), hash1)
+						if file2_dup:
+							hash2 = self.last_file2.hash
+						else:
 							status |= self.lib.fuzzy_hash_filename(str2bytes(file2), hash2)
 					if status == 0:
+						if not file1_dup:
+							self.last_file1.hash = hash1
+						if not file2_dup:
+							self.last_file2.hash = hash2
 						if hash1.raw == hash2.raw:
 							return 100
 						else:
@@ -115,10 +169,10 @@ class HashMatch(object):
 	def is_match(self, match):
 		'''
-		Returns True if the match value is greater than or equal to the cutoff.
+		Returns True if this is a good match.
-		Returns False if the match value is less than the cutoff.
+		Returns False if this is not a good match.
 		'''
-		return (match is not None and match >= self.cutoff)
+		return (match is not None and ((match >= self.cutoff and self.show_same) or (match < self.cutoff and not self.show_same)))
 	def _get_file_list(self, directory):
 		'''
@@ -147,13 +201,14 @@ class HashMatch(object):
 				# Filter based on the file type, as reported by libmagic
 				if self.types:
 					for f in files:
-						for (include, type_regex) in iterator(self.types):
+						for (include, regex_list) in iterator(self.types):
+							for regex in regex_list:
 								try:
 									magic_result = self.magic.file(os.path.join(directory, f)).lower()
 								except Exception as e:
 									magic_result = ''
-							match = type_regex.match(magic_result)
+								match = regex.match(magic_result)
 								# If this matched an include filter, or didn't match an exclude filter
 								if (match and include) or (not match and not include):
@@ -161,7 +216,8 @@ class HashMatch(object):
 				# Filter based on file name
 				if self.matches:
-					for (include, file_filter) in iterator(self.matches):
+					for (include, file_filter_list) in iterator(self.matches):
+						for file_filter in file_filter_list:
 							matching_files = fnmatch.filter(files, file_filter)
 							# If this is an include filter, add all matching files to the list
@@ -173,74 +229,98 @@ class HashMatch(object):
 		return set(file_list)
-	def files(self, file1, file2):
+	def files(self, needle, haystack):
-		m = self._compare_files(file1, file2)
+		'''
-		if m is None:
+		Compare one file against a list of other files.
-			m = 0
-		return [(m, file2)]
-	def file(self, fname, directories):
+		@needle   - File to match against.
+		@haystack - A list of haystack files.
+		Returns a list of tuple results.
 		'''
-		Search for a particular file in multiple directories.
+		results = []
+		self.total = 0
+		for f in haystack:
+			m = self._compare_files(needle, f)
+			if m is not None and self.is_match(m):
+				self._print(m, f)
+				results.append((m, f))
-		@fname       - File to search for.
+				self.total += 1
-		@directories - List of directories to search in.
+				if self.max_results and self.total >= self.max_results:
+					break
+		self._print_footer()
+		return results
+	def file(self, needle, haystack):
+		'''
+		Search for one file inside one or more directories.
+		@needle   - File to search for.
+		@haystack - List of directories to search in.
 		Returns a list of tuple results.
 		'''
 		matching_files = []
 		self.total = 0
+		done = False
-		for directory in directories:
+		for directory in haystack:
 			for f in self._get_file_list(directory):
 				f = os.path.join(directory, f)
-				m = self._compare_files(fname, f)
+				m = self._compare_files(needle, f)
 				if m is not None and self.is_match(m):
+					self._print(m, f)
 					matching_files.append((m, f))
 					self.total += 1
 					if self.max_results and self.total >= self.max_results:
-						return matching_files
+						done = True
+						break
+			if done:
+				break
+		self._print_footer()
 		return matching_files
-	def directories(self, source, dir_list):
+	def directories(self, needle, haystack):
 		'''
-		Search two directories for matching files.
+		Compare the contents of one directory with the contents of other directories.
 		@source   - Source directory to compare everything to.
 		@dir_list - Compare files in source to files in these directories.
 		Returns a list of tuple results.
 		'''
+		done = False
 		results = []
 		self.total = 0
-		source_files = self._get_file_list(source)
+		source_files = self._get_file_list(needle)
-		for directory in dir_list:
+		for directory in haystack:
 			dir_files = self._get_file_list(directory)
 			for f in source_files:
 				if f in dir_files:
-					file1 = os.path.join(source, f)
+					file1 = os.path.join(needle, f)
 					file2 = os.path.join(directory, f)
 					m = self._compare_files(file1, file2)
-					if m is not None:
+					if m is not None and self.is_match(m):
-						matches = self.is_match(m)
+						self._print(m, f)
+						results.append((m, f))
-						if (matches and self.show_same) or (not matches and not self.show_same):
-							results.append(("%3d" % m, f))
 						self.total += 1
 						if self.max_results and self.total >= self.max_results:
-								return results
+							done = True
+							break
-		if self.show_missing and len(dir_list) == 1:
+			if done:
-			results += [('---', f) for f in (source_files-dir_files)]
+				break
-			results += [('+++', f) for f in (dir_files-source_files)]
+		self._print_footer()
 		return results

--- a/src/binwalk/prettyprint.py
+++ b/src/binwalk/prettyprint.py
@@ -37,7 +37,7 @@ class PrettyPrint:
 	MAX_LINE_LEN = 0
 	DEFAULT_DESCRIPTION_HEADER = "DESCRIPTION"
-	def __init__(self, binwalk, log=None, csv=False, quiet=False, verbose=0, format_to_screen=False):
+	def __init__(self, binwalk=None, log=None, csv=False, quiet=False, verbose=0, format_to_screen=False):
 		'''
 		Class constructor.
@@ -109,7 +109,7 @@ class PrettyPrint:
 				data_parts = data.split(None, 2)
-				if len(data_parts) == 3:
+				if len(data_parts) in [2,3]:
 					for i in range(0, len(data_parts)):
 						data_parts[i] = data_parts[i].strip()
@@ -223,6 +223,7 @@ class PrettyPrint:
 		self._pprint("\n")
 		self._pprint("Scan Time:     %s\n" % timestamp, nolog=nolog)
+		if self.binwalk:
 			self._pprint("Signatures:    %d\n" % self.binwalk.parser.signature_count, nolog=nolog)
 		self._pprint("Target File:   %s\n" % file_name, nolog=nolog)
 		self._pprint("MD5 Checksum:  %s\n" % md5sum, nolog=nolog)
@@ -276,7 +277,7 @@ class PrettyPrint:
 		for info in results:
 			# Check for any grep filters before printing
-			if self.binwalk.filter.grep(info['description']):
+			if not self.binwalk or self.binwalk.filter.grep(info['description']):
 				if not formatted:
 					# Only display the offset once per list of results
 					if not offset_printed: