Initial commit of rehash; some bug fixes / feature additions left to do in hashmatch.py

3992d1cd · devttys0 · 48a1a48b · 3992d1cd · 3992d1cd · 3992d1cd
Commit 3992d1cd authored Dec 04, 2013 by devttys0
Showing with 205 additions and 46 deletions

rehash src/bin/rehash +121 -0

hashmatch.py src/binwalk/hashmatch.py +62 -33

binwalk src/binwalk/magic/binwalk +0 -0

smartstrings.py src/binwalk/smartstrings.py +22 -13

No files found.
--- a/src/bin/rehash
+++ b/src/bin/rehash
+#!/usr/bin/env python
+import os
+import re
+import sys
+import magic
+import binwalk.hashmatch as hashmatch
+from binwalk.compat import *
+from getopt import GetoptError, gnu_getopt as GetOpt
+def usage(fd):
+	fd.write("Usage: %s [OPTIONS] [FILE | DIR] [FILE | DIR] ...\n" % sys.argv[0])
+def main():
+	results = []
+	options = []
+	arguments = []
+	file_list = []
+	types = {}
+	strings = False
+	symlinks = False
+	all_types = False
+	name = False
+	same = False
+	missing = False
+	cutoff = None
+	max_results = None
+	verbose = False
+	short_options = "c:hlmnSsvx:"
+	long_options = [
+			"help",
+			"cutoff=",
+			"strings",
+			"show-same",
+			"show-missing",
+			"max=",
+			"symlinks",
+			"name",
+			"file-type",
+			"file-name",
+			"verbose",
+	]
+	try:
+		opts, args = GetOpt(sys.argv[1:], short_options, long_options)
+	except GetoptError as e:
+		sys.stderr.write("%s\n" % str(e))
+		usage(sys.stderr)
+	for opt, arg in opts:
+		if opt in ("-h", "--help"):
+			usage(sys.stdout)
+		elif opt in ("-S", "--strings"):
+			strings = True
+		elif opt in ("-l", "--symlinks"):
+			symlinks = True
+		elif opt in ("-n", "--name"):
+			name = True
+		elif opt in ("-s", "--show-same"):
+			same = True
+		elif opt in ("-m", "--show-missing"):
+			missing = True
+		elif opt in ("-x", "--max"):
+			max_results = int(arg, 0)
+		elif opt in ("-c", "--cutoff"):
+			cutoff = int(arg, 0)
+		elif opt in ("-v", "--verbose"):
+			verbose = True
+		# Keep track of the options and arguments.
+		# This is used later to determine which argv entries are file names.
+		options.append(opt)
+		options.append("%s%s" % (opt, arg))
+		options.append("%s=%s" % (opt, arg))
+		arguments.append(arg)
+	# Treat any command line options not processed by getopt as target file paths.
+	for opt in sys.argv[1:]:
+		if opt not in arguments and opt not in options and not opt.startswith('-'):
+			file_list.append(opt)
+	if len(file_list) >= 2:
+		rehash = hashmatch.HashMatch(cutoff=cutoff,
+						strings=strings,
+						symlinks=symlinks, 
+						name=name, 
+						same=same,
+						missing=missing,
+						max_results=max_results,
+						verbose=verbose)
+		if os.path.isfile(file_list[0]):
+			if not all_types and len(types) == 0:
+				m = magic.open(0)
+				m.load()
+				file_type = m.file(file_list[0])
+				if file_type:
+					types[True] = re.escape(file_type.lower())
+			if os.path.isfile(file_list[1]):
+				results = rehash.files(file_list[0], file_list[1])
+			else:
+				results = rehash.file(file_list[0], file_list[1:])
+		else:
+			for f in file_list:
+				if not os.path.isdir(f):
+					print("Invalid usage")
+					usage(sys.stderr)
+			results = rehash.directories(file_list[0], file_list[1])
+	for (match, fname) in results:
+		print("%s  %s" % (match, fname))
+if __name__ == "__main__":
+	main()
--- a/src/binwalk/hashmatch.py
+++ b/src/binwalk/hashmatch.py
@@ -20,24 +20,24 @@ class HashMatch(object):
 	FUZZY_DEFAULT_CUTOFF = 50
-	def __init__(self, cutoff=None, fuzzy=True, strings=False, same=False, missing=False, symlinks=False, name=False, matches={}, types={}):
+	def __init__(self, cutoff=None, strings=False, same=False, missing=False, symlinks=False, name=False, max_results=None, matches={}, types={}, verbose=False):
 		'''
 		Class constructor.
 		@cutoff          - The fuzzy cutoff which determines if files are different or not.
-		@fuzzy           - Set to True to do fuzzy hashing; set to False to do traditional hashing.
 		@strings         - Only hash strings inside of the file, not the entire file itself.
 		@same            - Set to True to show files that are the same, False to show files that are different.
 		@missing         - Set to True to show missing files.
 		@symlinks        - Set to True to include symbolic link files.
 		@name            - Set to True to only compare files whose base names match.
+		@max_results     - Stop searching after x number of matches.
 		@matches         - A dictionary of file names to diff.
 		@types           - A dictionary of file types to diff.
+		@verbose         - Enable verbose mode.
 		Returns None.
 		'''
 		self.cutoff = cutoff
-		self.fuzzy = fuzzy
 		self.strings = strings
 		self.show_same = same
 		self.show_missing = missing
@@ -45,6 +45,10 @@ class HashMatch(object):
 		self.matches = matches
 		self.name = name
 		self.types = types
+		self.max_results = max_results
+		self.verbose = verbose
+		self.total = 0
 		self.magic = magic.open(0)
 		self.magic.load()
@@ -58,9 +62,13 @@ class HashMatch(object):
 			self.types[k] = re.compile(self.types[k])
 	def _get_strings(self, fname):
-		return ''.join([string for (offset, string) in binwalk.smartstrings.FileStrings(fname, n=10).strings()])
+		return ''.join([string for (offset, string) in binwalk.smartstrings.FileStrings(fname, n=10, block=None).strings()])
-	def files(self, file1, file2):
+	def _print(self, message):
+		if self.verbose:
+			print(message)
+	def _compare_files(self, file1, file2):
 		'''
 		Fuzzy diff two files.
@@ -73,7 +81,10 @@ class HashMatch(object):
 		status = 0
 		if not self.name or os.path.basename(file1) == os.path.basename(file2):
-			if self.fuzzy:
+			if os.path.exists(file1) and os.path.exists(file2):
+				self._print("Checking %s -> %s" % (file1, file2))
 				hash1 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)
 				hash2 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)
@@ -98,11 +109,7 @@ class HashMatch(object):
 						else:
 							return self.lib.fuzzy_compare(hash1, hash2)
 				except Exception as e:
-					print "WARNING: Exception while performing fuzzy comparison:", e
+					print "WARNING: Exception while doing fuzzy hash:", e
-			elif not self.strings:
-				if file_md5(file1) == file_md5(file2):
-					return 100
 		return None
@@ -111,7 +118,7 @@ class HashMatch(object):
 		Returns True if the match value is greater than or equal to the cutoff.
 		Returns False if the match value is less than the cutoff.
 		'''
-		return (match >= self.cutoff)
+		return (match is not None and match >= self.cutoff)
 	def _get_file_list(self, directory):
 		'''
@@ -141,7 +148,11 @@ class HashMatch(object):
 				if self.types:
 					for f in files:
 						for (include, type_regex) in iterator(self.types):
-							magic_result = self.magic.file(f).lower()
+							try:
+								magic_result = self.magic.file(os.path.join(directory, f)).lower()
+							except Exception as e:
+								magic_result = ''
 							match = type_regex.match(magic_result)
 							# If this matched an include filter, or didn't match an exclude filter
@@ -162,6 +173,12 @@ class HashMatch(object):
 		return set(file_list)
+	def files(self, file1, file2):
+		m = self._compare_files(file1, file2)
+		if m is None:
+			m = 0
+		return [(m, file2)]
 	def file(self, fname, directories):
 		'''
 		Search for a particular file in multiple directories.
@@ -172,45 +189,57 @@ class HashMatch(object):
 		Returns a list of tuple results.
 		'''
 		matching_files = []
+		self.total = 0
 		for directory in directories:
 			for f in self._get_file_list(directory):
 				f = os.path.join(directory, f)
-				m = self.files(fname, f)
+				m = self._compare_files(fname, f)
-				if self.is_match(m):
+				if m is not None and self.is_match(m):
 					matching_files.append((m, f))
+					self.total += 1
+					if self.max_results and self.total >= self.max_results:
+						return matching_files
 		return matching_files
-	def directories(self, dir1, dir2):
+	def directories(self, source, dir_list):
 		'''
 		Search two directories for matching files.
-		@dir1 - First directory.
+		@source   - Source directory to compare everything to.
-		@dir2 - Second directory.
+		@dir_list - Compare files in source to files in these directories.
 		Returns a list of tuple results.
 		'''
 		results = []
+		self.total = 0
+		source_files = self._get_file_list(source)
-		dir1_files = self._get_file_list(dir1)
+		for directory in dir_list:
-		dir2_files = self._get_file_list(dir2)
+			dir_files = self._get_file_list(directory)
+			for f in source_files:
+				if f in dir_files:
+					file1 = os.path.join(source, f)
+					file2 = os.path.join(directory, f)
-		for f in dir1_files:
+					m = self._compare_files(file1, file2)
-			if f in dir2_files:
+					if m is not None:
-				file1 = os.path.join(dir1, f)
+						matches = self.is_match(m)
-				file2 = os.path.join(dir2, f)
-				m = self.files(file1, file2)
+						if (matches and self.show_same) or (not matches and not self.show_same):
-				if m is not None:
+							results.append(("%3d" % m, f))
-					matches = self.is_match(m)
-					if (matches and self.show_same) or (not matches and not self.show_same):
+							self.total += 1
-						results.append(("%3d" % m, f))
+							if self.max_results and self.total >= self.max_results:
+								return results
-		if self.show_missing:
+		if self.show_missing and len(dir_list) == 1:
-			results += [('---', f) for f in (dir1_files-dir2_files)]
+			results += [('---', f) for f in (source_files-dir_files)]
-			results += [('+++', f) for f in (dir2_files-dir1_files)]
+			results += [('+++', f) for f in (dir_files-source_files)]
 		return results
@@ -218,7 +247,7 @@ class HashMatch(object):
 if __name__ == '__main__':
 	import sys
-	hmatch = HashMatch(strings=True, name=True)
+	hmatch = HashMatch(strings=True, name=False, types={True:"^elf"})
 	print hmatch.file(sys.argv[1], sys.argv[2:])
 	#for (match, fname) in hmatch.directories(sys.argv[1], sys.argv[2]):
 	#for (match, fname) in hmatch.find_file(sys.argv[1], sys.argv[2:]):

--- a/src/binwalk/magic/binwalk
+++ b/src/binwalk/magic/binwalk
--- a/src/binwalk/smartstrings.py
+++ b/src/binwalk/smartstrings.py
@@ -41,7 +41,7 @@ class FileStrings(object):
 		@length    - The number of bytes in the file to analyze.
 		@offset    - The starting offset into the file to begin analysis.
 		@n         - The minimum valid string length.
-		@block     - The block size to use when performing entropy analysis.
+		@block     - The block size to use when performing entropy analysis. Set to None to skip entropy analysis.
 		@algorithm - The entropy algorithm to use when performing entropy analysis.
 		@plugins   - An instance of the Plugins class.
@@ -59,22 +59,31 @@ class FileStrings(object):
 		self.valid_strings = []
 		self.external_validators = []
 		self.plugins = plugins
+		self.block = block
 		if not self.n:
 			self.n = self.MIN_STRING_LENGTH
-		# Perform an entropy analysis over the entire file (anything less may generate poor entropy data).
+		if self.block is not None:
-		# Give fake file results list to prevent FileEntropy from doing too much analysis.
+			# Perform an entropy analysis over the entire file (anything less may generate poor entropy data).
-		with entropy.FileEntropy(file_name, block=block, file_results=['foo']) as e:
+			# Give fake file results list to prevent FileEntropy from doing too much analysis.
-			(self.x, self.y, self.average_entropy) = e.analyze(algorithm=algorithm)
+			with entropy.FileEntropy(file_name, block=self.block, file_results=['foo']) as e:
-			for i in range(0, len(self.x)):
+				(self.x, self.y, self.average_entropy) = e.analyze(algorithm=algorithm)
-				self.entropy[self.x[i]] = self.y[i]
+				for i in range(0, len(self.x)):
-			# Make sure our block size matches the entropy analysis's block size
+					self.entropy[self.x[i]] = self.y[i]
-			self.block = e.block
+				# Make sure our block size matches the entropy analysis's block size
+				self.block = e.block
-		# Make sure the starting offset is a multiple of the block size; else, when later checking
+			# Make sure the starting offset is a multiple of the block size; else, when later checking
-		# the entropy analysis, block offsets won't line up.
+			# the entropy analysis, block offsets won't line up.
-		self.start -= (self.start % self.block)
+			self.start -= (self.start % self.block)
+		else:
+			i = 0
+			self.block = common.BlockFile.READ_BLOCK_SIZE
+			# Fake the entropy scan
+			while i < common.file_size(file_name):
+				self.entropy[i] = 1.0
+				i += self.block
 		self.fd = common.BlockFile(file_name, 'r', length=length, offset=self.start)
 		# TODO: This is not optimal. We should read in larger chunks and process it into self.block chunks.