Updated hashmatch.py to perform strings-based hashing; added snappy stream compression signature.

48a1a48b · devttys0 · a30e51e1 · 48a1a48b · 48a1a48b · 48a1a48b
Commit 48a1a48b authored Dec 04, 2013 by devttys0
Showing with 49 additions and 15 deletions

hashmatch.py src/binwalk/hashmatch.py +32 -9

binwalk src/binwalk/magic/binwalk +0 -0

smartstrings.py src/binwalk/smartstrings.py +13 -6

compressed src/magic/compressed +4 -0

No files found.
--- a/src/binwalk/hashmatch.py
+++ b/src/binwalk/hashmatch.py
 import os
 import re
+import magic
 import fnmatch
 import ctypes
 import ctypes.util
-
-import magic
+import binwalk.smartstrings
 from binwalk.compat import *
 from binwalk.common import file_md5

@@ -20,7 +20,7 @@ class HashMatch(object):

 	FUZZY_DEFAULT_CUTOFF = 50

-	def __init__(self, cutoff=None, fuzzy=True, strings=False, same=False, missing=False, symlinks=False, matches={}, types={}):
+	def __init__(self, cutoff=None, fuzzy=True, strings=False, same=False, missing=False, symlinks=False, name=False, matches={}, types={}):
 		'''
 		Class constructor.

@@ -30,6 +30,7 @@ class HashMatch(object):
 		@same            - Set to True to show files that are the same, False to show files that are different.
 		@missing         - Set to True to show missing files.
 		@symlinks        - Set to True to include symbolic link files.
+		@name            - Set to True to only compare files whose base names match.
 		@matches         - A dictionary of file names to diff.
 		@types           - A dictionary of file types to diff.

@@ -42,6 +43,7 @@ class HashMatch(object):
 		self.show_missing = missing
 		self.symlinks = symlinks
 		self.matches = matches
+		self.name = name
 		self.types = types

 		self.magic = magic.open(0)
@@ -55,6 +57,9 @@ class HashMatch(object):
 		for k in get_keys(self.types):
 			self.types[k] = re.compile(self.types[k])

+	def _get_strings(self, fname):
+		return ''.join([string for (offset, string) in binwalk.smartstrings.FileStrings(fname, n=10).strings()])
+
 	def files(self, file1, file2):
 		'''
 		Fuzzy diff two files.
@@ -65,21 +70,37 @@ class HashMatch(object):
 		Returns the match percentage.	
 		Returns None on error.
 		'''
+		status = 0

+		if not self.name or os.path.basename(file1) == os.path.basename(file2):
 			if self.fuzzy:
 				hash1 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)
 				hash2 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)

-			# TODO: Implement strings hashing
 				try:
-				if self.lib.fuzzy_hash_filename(str2bytes(file1), hash1) == 0 and self.lib.fuzzy_hash_filename(str2bytes(file2), hash2) == 0:
+					if self.strings:
+						file1_strings = self._get_strings(file1)
+						file2_strings = self._get_strings(file2)
+
+						if file1_strings == file2_strings:
+							return 100
+						else:
+							status |= self.lib.fuzzy_hash_buf(str2bytes(file1_strings), len(file1_strings), hash1)
+							status |= self.lib.fuzzy_hash_buf(str2bytes(file2_strings), len(file2_strings), hash2)
+						
+					else:
+						status |= self.lib.fuzzy_hash_filename(str2bytes(file1), hash1)
+						status |= self.lib.fuzzy_hash_filename(str2bytes(file2), hash2)
+				
+					if status == 0:
 						if hash1.raw == hash2.raw:
 							return 100
 						else:
 							return self.lib.fuzzy_compare(hash1, hash2)
 				except Exception as e:
 					print "WARNING: Exception while performing fuzzy comparison:", e
-		else:
+
+			elif not self.strings:
 				if file_md5(file1) == file_md5(file2):
 					return 100

@@ -181,6 +202,7 @@ class HashMatch(object):
 				file2 = os.path.join(dir2, f)

 				m = self.files(file1, file2)
+				if m is not None:
 					matches = self.is_match(m)

 					if (matches and self.show_same) or (not matches and not self.show_same):
@@ -196,8 +218,9 @@ class HashMatch(object):
 if __name__ == '__main__':
 	import sys
 	
-	hmatch = HashMatch(missing=True)
+	hmatch = HashMatch(strings=True, name=True)
+	print hmatch.file(sys.argv[1], sys.argv[2:])
 	#for (match, fname) in hmatch.directories(sys.argv[1], sys.argv[2]):
-	for (match, fname) in hmatch.find_file(sys.argv[1], sys.argv[2:]):
-		print match, fname
+	#for (match, fname) in hmatch.find_file(sys.argv[1], sys.argv[2:]):
+	#	print match, fname

--- a/src/binwalk/magic/binwalk
+++ b/src/binwalk/magic/binwalk
--- a/src/binwalk/smartstrings.py
+++ b/src/binwalk/smartstrings.py
@@ -15,6 +15,7 @@ class FileStrings(object):
 	MAX_STRING_LENGTH = 20
 	MAX_SPECIAL_CHARS_RATIO = .4
 	MAX_ENTROPY = 0.9
+	DEFAULT_ENTROPY_BLOCK = 1024

 	LETTERS = set(string.letters)
 	NUMBERS = set(string.digits)
@@ -31,7 +32,7 @@ class FileStrings(object):
 			'(' : ')',
 	}
 	
-	def __init__(self, file_name, binwalk, length=0, offset=0, n=MIN_STRING_LENGTH, block=0, algorithm=None, plugins=None):
+	def __init__(self, file_name, binwalk=None, length=0, offset=0, n=MIN_STRING_LENGTH, block=DEFAULT_ENTROPY_BLOCK, algorithm='gzip', plugins=None):
 		'''
 		Class constructor. Preferred to be invoked from the Strings class instead of directly.

@@ -65,7 +66,7 @@ class FileStrings(object):
 		# Perform an entropy analysis over the entire file (anything less may generate poor entropy data).
 		# Give fake file results list to prevent FileEntropy from doing too much analysis.
 		with entropy.FileEntropy(file_name, block=block, file_results=['foo']) as e:
-			(self.x, self.y, self.average_entropy) = e.analyze()
+			(self.x, self.y, self.average_entropy) = e.analyze(algorithm=algorithm)
 			for i in range(0, len(self.x)):
 				self.entropy[self.x[i]] = self.y[i]
 			# Make sure our block size matches the entropy analysis's block size
@@ -82,6 +83,7 @@ class FileStrings(object):
 		self.start = self.fd.offset

 		# Set the total_scanned and scan_length values for plugins and status display messages
+		if self.binwalk:
 			self.binwalk.total_scanned = 0
 			self.binwalk.scan_length = self.fd.length

@@ -128,6 +130,7 @@ class FileStrings(object):

 		(data, dlen) = self.fd.read_block()

+		if self.binwalk:
 			self.binwalk.total_scanned = self.total_read
 			self.total_read += dlen

@@ -309,6 +312,7 @@ class FileStrings(object):
 				string = results['description']

 			if not ((plug_ret | plug_pre ) & plugins.PLUGIN_NO_DISPLAY):
+				if self.binwalk:
 					self.binwalk.display.results(offset, [results])
 				self.valid_strings.append((offset, string))
 		return plug_ret
@@ -354,7 +358,7 @@ class Strings(object):
 	Class for performing a strings analysis against a list of files.
 	'''

-	def __init__(self, file_names, binwalk, length=0, offset=0, n=0, block=0, algorithm=None, load_plugins=True, whitelist=[], blacklist=[]):
+	def __init__(self, file_names, binwalk=None, length=0, offset=0, n=0, block=0, algorithm=None, load_plugins=True, whitelist=[], blacklist=[]):
 		'''
 		Class constructor.

@@ -378,13 +382,14 @@ class Strings(object):
 		self.n = n
 		self.block = block
 		self.algorithm = algorithm
-		self.binwalk.scan_type = self.binwalk.STRINGS
 		self.file_strings = None
+		self.plugins = None
+		
+		if self.binwalk:
+			self.binwalk.scan_type = self.binwalk.STRINGS

 			if load_plugins:
 				self.plugins = plugins.Plugins(self.binwalk, whitelist=whitelist, blacklist=blacklist)
-		else:
-			self.plugins = None

 	def __enter__(self):
 		return self
@@ -429,7 +434,9 @@ class Strings(object):
 			self.plugins._load_plugins()

 		for file_name in self.file_names:
+			if self.binwalk:
 				self.binwalk.display.header(file_name=file_name, description='Strings')
+
 			results[file_name] = []

 			self.file_strings = FileStrings(file_name, self.binwalk, self.length, self.offset, self.n, block=self.block, algorithm=self.algorithm, plugins=self.plugins)

--- a/src/magic/compressed
+++ b/src/magic/compressed
@@ -165,3 +165,7 @@
 >6	byte&0x10	0x10		multi-block stream

 # See lzma file for LZMA signatures
+
+
+0	string	\xff\x06\x00\x00\x73\x4e\x61\x50\x70\x59	Snappy compression, stream identifier
+