Commit 48a1a48b by devttys0

Updated hashmatch.py to perform strings-based hashing; added snappy stream compression signature.

parent a30e51e1
import os
import re
import magic
import fnmatch
import ctypes
import ctypes.util
import magic
import binwalk.smartstrings
from binwalk.compat import *
from binwalk.common import file_md5
......@@ -20,7 +20,7 @@ class HashMatch(object):
FUZZY_DEFAULT_CUTOFF = 50
def __init__(self, cutoff=None, fuzzy=True, strings=False, same=False, missing=False, symlinks=False, matches={}, types={}):
def __init__(self, cutoff=None, fuzzy=True, strings=False, same=False, missing=False, symlinks=False, name=False, matches={}, types={}):
'''
Class constructor.
......@@ -30,6 +30,7 @@ class HashMatch(object):
@same - Set to True to show files that are the same, False to show files that are different.
@missing - Set to True to show missing files.
@symlinks - Set to True to include symbolic link files.
@name - Set to True to only compare files whose base names match.
@matches - A dictionary of file names to diff.
@types - A dictionary of file types to diff.
......@@ -42,6 +43,7 @@ class HashMatch(object):
self.show_missing = missing
self.symlinks = symlinks
self.matches = matches
self.name = name
self.types = types
self.magic = magic.open(0)
......@@ -55,6 +57,9 @@ class HashMatch(object):
for k in get_keys(self.types):
self.types[k] = re.compile(self.types[k])
def _get_strings(self, fname):
    '''
    Collect the strings found in a file and concatenate them into one string.

    @fname - Path to the file whose strings should be extracted.

    Returns a single string made of every string yielded by
    binwalk.smartstrings.FileStrings(fname, n=10).strings(), joined with no separator.
    '''
    collected = []
    # strings() yields (offset, string) pairs; only the string text is kept.
    for (offset, text) in binwalk.smartstrings.FileStrings(fname, n=10).strings():
        collected.append(text)
    return ''.join(collected)
def files(self, file1, file2):
'''
Fuzzy diff two files.
......@@ -65,21 +70,37 @@ class HashMatch(object):
Returns the match percentage.
Returns None on error.
'''
status = 0
if not self.name or os.path.basename(file1) == os.path.basename(file2):
if self.fuzzy:
hash1 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)
hash2 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)
# TODO: Implement strings hashing
try:
if self.lib.fuzzy_hash_filename(str2bytes(file1), hash1) == 0 and self.lib.fuzzy_hash_filename(str2bytes(file2), hash2) == 0:
if self.strings:
file1_strings = self._get_strings(file1)
file2_strings = self._get_strings(file2)
if file1_strings == file2_strings:
return 100
else:
status |= self.lib.fuzzy_hash_buf(str2bytes(file1_strings), len(file1_strings), hash1)
status |= self.lib.fuzzy_hash_buf(str2bytes(file2_strings), len(file2_strings), hash2)
else:
status |= self.lib.fuzzy_hash_filename(str2bytes(file1), hash1)
status |= self.lib.fuzzy_hash_filename(str2bytes(file2), hash2)
if status == 0:
if hash1.raw == hash2.raw:
return 100
else:
return self.lib.fuzzy_compare(hash1, hash2)
except Exception as e:
print "WARNING: Exception while performing fuzzy comparison:", e
else:
elif not self.strings:
if file_md5(file1) == file_md5(file2):
return 100
......@@ -181,6 +202,7 @@ class HashMatch(object):
file2 = os.path.join(dir2, f)
m = self.files(file1, file2)
if m is not None:
matches = self.is_match(m)
if (matches and self.show_same) or (not matches and not self.show_same):
......@@ -196,8 +218,9 @@ class HashMatch(object):
if __name__ == '__main__':
import sys
hmatch = HashMatch(missing=True)
hmatch = HashMatch(strings=True, name=True)
print hmatch.file(sys.argv[1], sys.argv[2:])
#for (match, fname) in hmatch.directories(sys.argv[1], sys.argv[2]):
for (match, fname) in hmatch.find_file(sys.argv[1], sys.argv[2:]):
print match, fname
#for (match, fname) in hmatch.find_file(sys.argv[1], sys.argv[2:]):
# print match, fname
......@@ -15,6 +15,7 @@ class FileStrings(object):
MAX_STRING_LENGTH = 20
MAX_SPECIAL_CHARS_RATIO = .4
MAX_ENTROPY = 0.9
DEFAULT_ENTROPY_BLOCK = 1024
LETTERS = set(string.letters)
NUMBERS = set(string.digits)
......@@ -31,7 +32,7 @@ class FileStrings(object):
'(' : ')',
}
def __init__(self, file_name, binwalk, length=0, offset=0, n=MIN_STRING_LENGTH, block=0, algorithm=None, plugins=None):
def __init__(self, file_name, binwalk=None, length=0, offset=0, n=MIN_STRING_LENGTH, block=DEFAULT_ENTROPY_BLOCK, algorithm='gzip', plugins=None):
'''
Class constructor. Preferred to be invoked from the Strings class instead of directly.
......@@ -65,7 +66,7 @@ class FileStrings(object):
# Perform an entropy analysis over the entire file (anything less may generate poor entropy data).
# Give fake file results list to prevent FileEntropy from doing too much analysis.
with entropy.FileEntropy(file_name, block=block, file_results=['foo']) as e:
(self.x, self.y, self.average_entropy) = e.analyze()
(self.x, self.y, self.average_entropy) = e.analyze(algorithm=algorithm)
for i in range(0, len(self.x)):
self.entropy[self.x[i]] = self.y[i]
# Make sure our block size matches the entropy analysis's block size
......@@ -82,6 +83,7 @@ class FileStrings(object):
self.start = self.fd.offset
# Set the total_scanned and scan_length values for plugins and status display messages
if self.binwalk:
self.binwalk.total_scanned = 0
self.binwalk.scan_length = self.fd.length
......@@ -128,6 +130,7 @@ class FileStrings(object):
(data, dlen) = self.fd.read_block()
if self.binwalk:
self.binwalk.total_scanned = self.total_read
self.total_read += dlen
......@@ -309,6 +312,7 @@ class FileStrings(object):
string = results['description']
if not ((plug_ret | plug_pre ) & plugins.PLUGIN_NO_DISPLAY):
if self.binwalk:
self.binwalk.display.results(offset, [results])
self.valid_strings.append((offset, string))
return plug_ret
......@@ -354,7 +358,7 @@ class Strings(object):
Class for performing a strings analysis against a list of files.
'''
def __init__(self, file_names, binwalk, length=0, offset=0, n=0, block=0, algorithm=None, load_plugins=True, whitelist=[], blacklist=[]):
def __init__(self, file_names, binwalk=None, length=0, offset=0, n=0, block=0, algorithm=None, load_plugins=True, whitelist=[], blacklist=[]):
'''
Class constructor.
......@@ -378,13 +382,14 @@ class Strings(object):
self.n = n
self.block = block
self.algorithm = algorithm
self.binwalk.scan_type = self.binwalk.STRINGS
self.file_strings = None
self.plugins = None
if self.binwalk:
self.binwalk.scan_type = self.binwalk.STRINGS
if load_plugins:
self.plugins = plugins.Plugins(self.binwalk, whitelist=whitelist, blacklist=blacklist)
else:
self.plugins = None
def __enter__(self):
return self
......@@ -429,7 +434,9 @@ class Strings(object):
self.plugins._load_plugins()
for file_name in self.file_names:
if self.binwalk:
self.binwalk.display.header(file_name=file_name, description='Strings')
results[file_name] = []
self.file_strings = FileStrings(file_name, self.binwalk, self.length, self.offset, self.n, block=self.block, algorithm=self.algorithm, plugins=self.plugins)
......
......@@ -165,3 +165,7 @@
>6 byte&0x10 0x10 multi-block stream
# See lzma file for LZMA signatures
0 string \xff\x06\x00\x00\x73\x4e\x61\x50\x70\x59 Snappy compression, stream identifier
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment