Commit d51dd6b1 by devttys0

Updated C installer to always build libfuzzy

parent 5839cd8d
...@@ -198,11 +198,11 @@ then ...@@ -198,11 +198,11 @@ then
pyqtgraph pyqtgraph
fi fi
if [ "$(python -c 'import ctypes.util; print (ctypes.util.find_library("fuzzy"))')" == "None" ] #if [ "$(python -c 'import ctypes.util; print (ctypes.util.find_library("fuzzy"))')" == "None" ]
then #then
echo "libfuzzy not installed; building from source..." # echo "libfuzzy not installed; building from source..."
libfuzzy # libfuzzy
fi #fi
# Get and build the firmware mod kit # Get and build the firmware mod kit
fmk fmk
......
...@@ -2,15 +2,38 @@ ...@@ -2,15 +2,38 @@
from __future__ import print_function from __future__ import print_function
import os import os
import sys import sys
import shutil
from distutils.core import setup from distutils.core import setup
from distutils.dir_util import remove_tree
# Python2/3 compliance # Python2/3 compliance
try: try:
raw_input raw_input
except: except NameError:
raw_input = input raw_input = input
def cleanup_build_directory():
    """Delete the ``build`` directory found in the current working directory.

    The path is relative, so the caller must already have changed into the
    src directory. Cleanup is best-effort: any ordinary error (for example the
    directory not existing) is ignored, while a Ctrl-C is passed on.
    """
    build_dir = "build"
    try:
        remove_tree(build_dir)
    except KeyboardInterrupt:
        # Let the user abort the installer.
        raise
    except Exception:
        # Nothing to clean, or the tree could not be removed; carry on.
        return
def cleanup_module_directory():
    """Empty the directory of any previously installed binwalk module.

    Installing over an existing copy does not remove files that have since
    been deleted from the module, so when "install" is in ``sys.argv`` every
    entry inside each directory listed in ``binwalk.__path__`` is removed.
    Nothing happens when "install" was not requested or when binwalk cannot
    be imported. Removal is best-effort: entries that cannot be deleted are
    skipped.
    """
    if "install" not in sys.argv:
        return

    try:
        import binwalk
    except ImportError:
        # No earlier installation is importable, so there is nothing to clean.
        return

    # Never purge a copy of the module that lives under the current working
    # directory (i.e. the source tree the installer is running from).
    source_root = os.path.realpath(os.getcwd()) + os.path.sep

    for path in binwalk.__path__:
        module_dir = os.path.realpath(path)
        if (module_dir + os.path.sep).startswith(source_root):
            continue

        try:
            entries = os.listdir(module_dir)
        except OSError:
            continue

        for entry in entries:
            target = os.path.join(module_dir, entry)
            try:
                # remove_tree() does not expand wildcards and only handles
                # directories, so each entry is removed individually.
                if os.path.isdir(target) and not os.path.islink(target):
                    remove_tree(target)
                else:
                    os.remove(target)
            except OSError:
                pass
# Change to the binwalk src directory
def warning(lines, terminate=True, prompt=True): def warning(lines, terminate=True, prompt=True):
WIDTH = 115 WIDTH = 115
...@@ -36,20 +59,7 @@ if "--yes" in sys.argv: ...@@ -36,20 +59,7 @@ if "--yes" in sys.argv:
else: else:
IGNORE_WARNINGS = False IGNORE_WARNINGS = False
# Look for old installations of binwalk and remove them to prevent conflicts with the new API # cd into the src directory, no matter where setup.py was invoked from
try:
import binwalk
for path in binwalk.__path__:
if not os.path.exists(os.path.join(path, "core")):
try:
print ("Cleaning up old installation...")
shutil.rmtree(path)
except:
pass
except:
pass
# Change to the binwalk src directory
os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "src")) os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "src"))
print("checking pre-requisites") print("checking pre-requisites")
...@@ -57,14 +67,14 @@ try: ...@@ -57,14 +67,14 @@ try:
import magic import magic
try: try:
magic.MAGIC_NO_CHECK_TEXT magic.MAGIC_NO_CHECK_TEXT
except Exception as e: except AttributeError as e:
msg = ["Pre-requisite failure: " + str(e), msg = ["Pre-requisite failure: " + str(e),
"It looks like you have an old or incompatible magic module installed.", "It looks like you have an old or incompatible magic module installed.",
"Please install the official python-magic module, or download and install it from source: ftp://ftp.astron.com/pub/file/" "Please install the official python-magic module, or download and install it from source: ftp://ftp.astron.com/pub/file/"
] ]
warning(msg) warning(msg)
except Exception as e: except ImportError as e:
msg = ["Pre-requisite failure:", str(e), msg = ["Pre-requisite failure:", str(e),
"Please install the python-magic module, or download and install it from source: ftp://ftp.astron.com/pub/file/", "Please install the python-magic module, or download and install it from source: ftp://ftp.astron.com/pub/file/",
] ]
...@@ -73,7 +83,7 @@ except Exception as e: ...@@ -73,7 +83,7 @@ except Exception as e:
try: try:
import pyqtgraph import pyqtgraph
except Exception as e: except ImportError as e:
msg = ["Pre-requisite check warning: " + str(e), msg = ["Pre-requisite check warning: " + str(e),
"To take advantage of this tool's graphing capabilities, please install the pyqtgraph module.", "To take advantage of this tool's graphing capabilities, please install the pyqtgraph module.",
] ]
...@@ -94,23 +104,26 @@ if not os.path.exists(c_lib_makefile): ...@@ -94,23 +104,26 @@ if not os.path.exists(c_lib_makefile):
status |= os.system("make") status |= os.system("make")
if status != 0: if status != 0:
msg = ["Build warning: failed to build compression libraries.", msg = ["Build warning: failed to build C libraries.",
"Some plugins will not work without these libraries." "Some features will not work without these libraries."
] ]
warning(msg, prompt=True) warning(msg, prompt=True)
elif "install" in sys.argv: elif "install" in sys.argv:
if os.system("make install") != 0: if os.system("make install") != 0:
msg = ["Install warning: failed to install compression libraries.", msg = ["Install warning: failed to install C libraries.",
"Some plugins will not work without these libraries." "Some features will not work without these libraries."
] ]
warning(msg, prompt=True) warning(msg, prompt=True)
os.system("make distclean")
os.chdir(working_directory) os.chdir(working_directory)
cleanup_build_directory()
cleanup_module_directory()
# Generate a new magic file from the files in the magic directory # Generate a new magic file from the files in the magic directory
print("generating binwalk magic file") print("creating binwalk magic file")
magic_files = os.listdir("magic") magic_files = os.listdir("magic")
magic_files.sort() magic_files.sort()
fd = open("binwalk/magic/binwalk", "wb") fd = open("binwalk/magic/binwalk", "wb")
...@@ -136,3 +149,5 @@ setup( name = "binwalk", ...@@ -136,3 +149,5 @@ setup( name = "binwalk",
scripts = ["scripts/binwalk"], scripts = ["scripts/binwalk"],
) )
cleanup_build_directory()
...@@ -10,18 +10,22 @@ export INSTALL_OPTIONS=@INSTALL_OPTIONS@ ...@@ -10,18 +10,22 @@ export INSTALL_OPTIONS=@INSTALL_OPTIONS@
all: clean all: clean
make -C miniz make -C miniz
make -C compress make -C compress
make -C fuzzy
install: install:
make -C miniz install make -C miniz install
make -C compress install make -C compress install
make -C fuzzy install
.PHONY: clean distclean .PHONY: clean distclean
clean: clean:
make -C miniz clean make -C miniz clean
make -C compress clean make -C compress clean
make -C fuzzy clean
distclean: distclean:
make -C miniz distclean make -C miniz distclean
make -C compress distclean make -C compress distclean
make -C fuzzy distclean
rm -rf *.cache config.* Makefile rm -rf *.cache config.* Makefile
This source diff could not be displayed because it is too large. You can view the blob instead.
ssdeep was written by Jesse Kornblum and Helmut Grohne.
2013-07-16: Jesse Kornblum <research@jessekornblum.com>:
* fuzzy.c: Fix heap corruption bug #15.
* dig.c: Removed extra call to GetFileAttributes
2013-07-09: Jesse Kornblum <research@jessekornblum.com>:
* dig.cpp, engine.cpp, helpers.cpp: Created separate directory traversal code
for Win32 systems.
2013-06-01: Jesse Kornblum <research@jessekornblum.com>:
* fuzzy.c: Experimental thread-safe patch from Helmut Grohne.
* fuzzy.h: Experimental thread-safe patch from Helmut Grohne.
2013-05-25: Jesse Kornblum <research@jessekornblum.com>:
* fuzzy.c: Fixed bug on string scoring.
2013-03-12: Jesse Kornblum <research@jessekornblum.com>:
* Changelog: Spelled my own name correctly.
* fuzzy.c: Fixed memory leak, bug 3607641.
2012-07-23: Jesse Kornblum <research@jessekornblum.com>:
* main.cpp, match.cpp: Renamed the match_pretty function for clarity.
2012-07-17: Jesse Kornblum <research@jessekornblum.com>:
* match.cpp: Fixing bugs in matching, clustering modes
* main.cpp: Clarifying comments. Added sanity check for -c, -g
2012-07-16: Jesse Kornblum <research@jessekornblum.com>:
* filedata.cpp: Added includes for *nix compilation
2012-07-14 Jesse Kornblum <research@jessekornblum.com>:
* win.sh: Updated for C++ compiler warnings
* normal.sh: Updated for C++ compiler warnings
* fuzzy.c: Corrected logic error for when sigs don't have filenames
* ui.cpp: Added const to print_error's state variable
* match.cpp: Rewrote to use Filedata class
* filedata.cpp: Created Filedata class
* main.cpp: Adapted to use Filedata class
2012-07-13 Jesse Kornblum <research@jessekornblum.com>:
* fuzzy.c: Fixed major bug regarding incorrect match scores for hashes with long filenames
* dig.cpp: Added warning message when not all data on stdin was hashed. Also increased stdin buffer to 512MB.
* configure.ac: Version bump to 2.9.
* main.cpp: Added flags for clustering mode
* ssdeep.h: Added flags for clustering mode
* match.cpp: Setting up for clustering mode
2012-05-25 Jesse Kornblum <research@jessekornblum.com>:
* main.cpp: Updated command line argument processing
* ssdeep.1: Clarifications on description.
2012-05-24 Jesse Kornblum <research@jessekornblum.com>:
* match.cpp: Convert to C++
2012-04-24 Jesse Kornblum <research@jessekornblum.com>:
* ssdeep.1: Updating support for stdin, clarifying modes
* Makefile.am: LF to CR/LF change now done by zip program
* ssdeep.h: Adding Doxygen comments
* match.cpp: Fixed bug in Win32 filename construction
* fuzzy.h: Cleanup and commenting
* engine.cpp: Added const definitions
* Experimental conversion to C++
* Commented out all references to clustering
* main.cpp: Fixed some bugs in the command line argument processing
2012-04-15: Jesse Kornblum <research@jessekornblum.com>:
* main.c: Fixed error handling in getopt processing
2012-02-16 Jesse Kornblum <research@jessekornblum.com>:
* main.c: Bump copyright to 2012 in usage message. Add flag for clustering
* main.h: Added mode_cluster
* ui.c: Changed Win32 to use fputc instead of _tfprintf for filenames
2012-01-12 Jesse Kornblum <research@jessekornblum.com>:
* ChangeLog: Clean up
2011-10-17 Jesse Kornblum <research@jessekornblum.com>:
* configure.ac: Added AC_SYS_LARGEFILE to handle large files on 32-bit platforms. See bug 3416762.
2011-09-30 Jesse Kornblum <research@jessekornblum.com>:
* ui.c: Redirected error messages to stderr instead of stdout
* main.c: Added warning message when the program does not process any file large enough to produce meaningful results.
2011-09-27 Jesse Kornblum <research@jessekornblum.com>:
* main.c: Added support to process stdin.
* dig.c: Added process_stdin function
* engine.c: Generalized display functions in display_result.
2011-08-19 Jesse Kornblum <research@jessekornblum.com>:
* edit_dist.c: Accepted patch from Brad Spengler to make
thread safe.
2010-07-15 Jesse Kornblum <research@jessekornblum.com>:
* Added quotes and quote escaping to filenames when
displayed in CSV matching mode.
* Modified FILEFORMAT to reflect quotation marks in filenames.
2010-06-24 Jesse Kornblum <research@jessekornblum.com>:
* Added parameter checking to my_basename.
2010-05-05 Jesse Kornblum <research@jessekornblum.com>:
* Changed logic for reading files of known hashes to look for
the header this version of the program writes instead of
the v1 header. This was done for the DC3 branch of the code.
* Updated README documentation and published version 2.5.
2010-03-20 Jesse Kornblum <research@jessekornblum.com>:
* Fixed define in fuzzy.h to only allow one compilation
2010-03-19 Jesse Kornblum <research@jessekornblum.com>:
* Added 'extern "C"' definitions to fuzzy.h for C++ compatibility
* Added return values indicating errors to API functions.
* Modified sample program to use errors on return values
* Added Doxygen documentation for API function.
* Added inttypes.h include to fuzzy.h.
* Added error checking for NULL strings in API functions.
* Version bump to 2.5
2010-02-24 Jesse Kornblum <research@jessekornblum.com>:
* Experimenting with mode to compare unknown hashes to known
without comparing the unknown against each other.
* Version bump to 2.4
2009-01-20 Jesse Kornblum <research@jessekornblum.com>:
* Added -a mode to display all matches regardless of score.
2009-10-11 Jesse Kornblum <research@jessekornblum.com>:
* Fixed typo in usage page.
2009-07-14 Jesse Kornblum <research@jessekornblum.com>:
* Fixed bug that prevented -x mode from working on Win32
* Added web pages to trunk. Updated quickstart guide with
automatic installation options and -x mode.
* Version bump to 2.2 and updated man page, NEWS
2009-07-11 Jesse Kornblum <research@jessekornblum.com>:
* Cleaned up -x mode to compare two (or more) files of
signatures.
* Cleaned up some code comments and Remove Before Flight tags
2009-04-18 Jesse Kornblum <research@jessekornblum.com>:
* Experimenting with -x mode to compare two (or more)
files of signatures
* Added some parameter validation code
2009-01-01 Jesse Kornblum <research@jessekornblum.com>:
* Added fuzzy_hash_filename function to hash a file given
its filename. This avoids issues passing FILE * structures
on Win32 systems in programs not using the fopen convention.
See feature request 2142005.
* Reconfigured all files using latest autoconf tools
2008-09-23 Jesse Kornblum <research@jessekornblum.com>:
* Reinstated the code to call match_pretty() at the end of
main. This lets the -p and -d modes display output and
fixes bug 2124423.
* Version bump to 2.1.
2008-04-06 Jesse Kornblum <research@jessekornblum.com>:
* Updated man page to include -t and -c modes.
2008-03-04 Jesse Kornblum <research@jessekornblum.com>:
* Made b64 variable static in fuzzy.c
2008-02-29 Jesse Kornblum <research@jessekornblum.com>:
* Version bump to 2.0, reconfigured.
2008-02-22 Jesse Kornblum <research@jessekornblum.com>:
* Changed reading of known hash files back to using
unsigned char values. This also required updating
the match_compare function to handle Unicode
characters when displaying match results.
* Flipped files in Win32 zip file to have CR/LF
2008-02-18 Jesse Kornblum <research@jessekornblum.com>:
* Added man pages to the EXTRA_DIST Makefile variable.
Still need to reconfigure.
* Added more documentation to the Windows zip file along
with the sample.c file
* Removed extraneous reconfiguration from Makefile.am
2008-02-17 Jesse Kornblum <research@jessekornblum.com>:
* Updated build system to create Win32 DLL, documentation,
and packages.
* Added check in print_error functions to see if the state
is valid.
* Updated documentation, usage message.
* Added signature comparison example to sample program.
2008-02-16 Jesse Kornblum <research@jessekornblum.com>:
* Removed block_size value from state variable.
Nobody was using it.
* Amended API functions to support hashing either a buffer
or an open file handle
* Moved ssdeep code into engine.c, moved fuzzy hashing
code into fuzzy.c. This will help us create libfuzzy
and fuzzy.dll.
* Lots of cleanup to fuzzy hashing code. This includes removing
types like 'uchar' and replacing them with C99 types like
unsigned char. Less work for mother means less work debugging.
* Moved definition of __progname to ssdeep.h. It's not being
used by the fuzzy hashing library and caused problems on OS X.
* Brought over code to support Unicode from Miss Identify
* Version bump to 2.0 beta1
* Added sample program to demonstrate API features
2008-02-15 Jesse Kornblum <research@jessekornblum.com>:
* Changing Win32 build to create DLL. All other
versions should have library/header files installed
2008-02-14 Jesse Kornblum <research@jessekornblum.com>:
* Moved to autotools structure
SSDEEP FILE FORMAT VERSION 1.1
1. REVISION HISTORY
14 Aug 2006 - Initial version (jk)
15 Jul 2010 - Adding quotation marks to filenames
2. FILE HEADER
The first line of the file is a header, like this:
ssdeep,1.1--blocksize:hash:hash,filename
ssdeep - Identifies the file type
1.1 - The version of the file format, NOT the version of the program
-- - Separator
The remainder of the line identifies the format of the file.
Note that for version 1.1 these values must be given EXACTLY as shown above
3. FILE DATA
Each line represents the hash of one file as listed in the header.
Specifically, we have the blocksize used by the program, the hash
for this blocksize and twice the blocksize, and the filename. Filenames
are enclosed in quotation marks. Filenames which contain a quotation mark
will have those quotes slash escaped. For example, the file ma"in.c
will be listed as:
"ma\"in.c"
Installation Instructions
*************************
Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005,
2006 Free Software Foundation, Inc.
This file is free documentation; the Free Software Foundation gives
unlimited permission to copy, distribute and modify it.
Basic Installation
==================
Briefly, the shell commands `./configure; make; make install' should
configure, build, and install this package. The following
more-detailed instructions are generic; see the `README' file for
instructions specific to this package.
The `configure' shell script attempts to guess correct values for
various system-dependent variables used during compilation. It uses
those values to create a `Makefile' in each directory of the package.
It may also create one or more `.h' files containing system-dependent
definitions. Finally, it creates a shell script `config.status' that
you can run in the future to recreate the current configuration, and a
file `config.log' containing compiler output (useful mainly for
debugging `configure').
It can also use an optional file (typically called `config.cache'
and enabled with `--cache-file=config.cache' or simply `-C') that saves
the results of its tests to speed up reconfiguring. Caching is
disabled by default to prevent problems with accidental use of stale
cache files.
If you need to do unusual things to compile the package, please try
to figure out how `configure' could check whether to do them, and mail
diffs or instructions to the address given in the `README' so they can
be considered for the next release. If you are using the cache, and at
some point `config.cache' contains results you don't want to keep, you
may remove or edit it.
The file `configure.ac' (or `configure.in') is used to create
`configure' by a program called `autoconf'. You need `configure.ac' if
you want to change it or regenerate `configure' using a newer version
of `autoconf'.
The simplest way to compile this package is:
1. `cd' to the directory containing the package's source code and type
`./configure' to configure the package for your system.
Running `configure' might take a while. While running, it prints
some messages telling which features it is checking for.
2. Type `make' to compile the package.
3. Optionally, type `make check' to run any self-tests that come with
the package.
4. Type `make install' to install the programs and any data files and
documentation.
5. You can remove the program binaries and object files from the
source code directory by typing `make clean'. To also remove the
files that `configure' created (so you can compile the package for
a different kind of computer), type `make distclean'. There is
also a `make maintainer-clean' target, but that is intended mainly
for the package's developers. If you use it, you may have to get
all sorts of other programs in order to regenerate files that came
with the distribution.
Compilers and Options
=====================
Some systems require unusual options for compilation or linking that the
`configure' script does not know about. Run `./configure --help' for
details on some of the pertinent environment variables.
You can give `configure' initial values for configuration parameters
by setting variables in the command line or in the environment. Here
is an example:
./configure CC=c99 CFLAGS=-g LIBS=-lposix
*Note Defining Variables::, for more details.
Compiling For Multiple Architectures
====================================
You can compile the package for more than one kind of computer at the
same time, by placing the object files for each architecture in their
own directory. To do this, you can use GNU `make'. `cd' to the
directory where you want the object files and executables to go and run
the `configure' script. `configure' automatically checks for the
source code in the directory that `configure' is in and in `..'.
With a non-GNU `make', it is safer to compile the package for one
architecture at a time in the source code directory. After you have
installed the package for one architecture, use `make distclean' before
reconfiguring for another architecture.
Installation Names
==================
By default, `make install' installs the package's commands under
`/usr/local/bin', include files under `/usr/local/include', etc. You
can specify an installation prefix other than `/usr/local' by giving
`configure' the option `--prefix=PREFIX'.
You can specify separate installation prefixes for
architecture-specific files and architecture-independent files. If you
pass the option `--exec-prefix=PREFIX' to `configure', the package uses
PREFIX as the prefix for installing programs and libraries.
Documentation and other data files still use the regular prefix.
In addition, if you use an unusual directory layout you can give
options like `--bindir=DIR' to specify different values for particular
kinds of files. Run `configure --help' for a list of the directories
you can set and what kinds of files go in them.
If the package supports it, you can cause programs to be installed
with an extra prefix or suffix on their names by giving `configure' the
option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
Optional Features
=================
Some packages pay attention to `--enable-FEATURE' options to
`configure', where FEATURE indicates an optional part of the package.
They may also pay attention to `--with-PACKAGE' options, where PACKAGE
is something like `gnu-as' or `x' (for the X Window System). The
`README' should mention any `--enable-' and `--with-' options that the
package recognizes.
For packages that use the X Window System, `configure' can usually
find the X include and library files automatically, but if it doesn't,
you can use the `configure' options `--x-includes=DIR' and
`--x-libraries=DIR' to specify their locations.
Specifying the System Type
==========================
There may be some features `configure' cannot figure out automatically,
but needs to determine by the type of machine the package will run on.
Usually, assuming the package is built to be run on the _same_
architectures, `configure' can figure that out, but if it prints a
message saying it cannot guess the machine type, give it the
`--build=TYPE' option. TYPE can either be a short name for the system
type, such as `sun4', or a canonical name which has the form:
CPU-COMPANY-SYSTEM
where SYSTEM can have one of these forms:
OS KERNEL-OS
See the file `config.sub' for the possible values of each field. If
`config.sub' isn't included in this package, then this package doesn't
need to know the machine type.
If you are _building_ compiler tools for cross-compiling, you should
use the option `--target=TYPE' to select the type of system they will
produce code for.
If you want to _use_ a cross compiler, that generates code for a
platform different from the build platform, you should specify the
"host" platform (i.e., that on which the generated programs will
eventually be run) with `--host=TYPE'.
Sharing Defaults
================
If you want to set default values for `configure' scripts to share, you
can create a site shell script called `config.site' that gives default
values for variables like `CC', `cache_file', and `prefix'.
`configure' looks for `PREFIX/share/config.site' if it exists, then
`PREFIX/etc/config.site' if it exists. Or, you can set the
`CONFIG_SITE' environment variable to the location of the site script.
A warning: not all `configure' scripts look for a site script.
Defining Variables
==================
Variables not defined in a site shell script can be set in the
environment passed to `configure'. However, some packages may run
configure again during the build, and the customized values of these
variables may be lost. In order to avoid this problem, you should set
them in the `configure' command line, using `VAR=value'. For example:
./configure CC=/usr/local2/bin/gcc
causes the specified `gcc' to be used as the C compiler (unless it is
overridden in the site shell script).
Unfortunately, this technique does not work for `CONFIG_SHELL' due to
an Autoconf bug. Until the bug is fixed you can use this workaround:
CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
`configure' Invocation
======================
`configure' recognizes the following options to control how it operates.
`--help'
`-h'
Print a summary of the options to `configure', and exit.
`--version'
`-V'
Print the version of Autoconf used to generate the `configure'
script, and exit.
`--cache-file=FILE'
Enable the cache: use and save the results of the tests in FILE,
traditionally `config.cache'. FILE defaults to `/dev/null' to
disable caching.
`--config-cache'
`-C'
Alias for `--cache-file=config.cache'.
`--quiet'
`--silent'
`-q'
Do not print messages saying which checks are being made. To
suppress all normal output, redirect it to `/dev/null' (any error
messages will still be shown).
`--srcdir=DIR'
Look for the package's source code in directory DIR. Usually
`configure' can determine that directory automatically.
`configure' also accepts some other, not widely useful, options. Run
`configure --help' for more details.
bin_PROGRAMS=ssdeep
ssdeep_LDADD=libfuzzy.la
ssdeep_LDFLAGS=-static
ACLOCAL_AMFLAGS = -I m4
lib_LTLIBRARIES=libfuzzy.la
libfuzzy_la_SOURCES=fuzzy.c edit_dist.c find-file-size.c
libfuzzy_la_LDFLAGS=-no-undefined -version-info 2:0:0
include_HEADERS=fuzzy.h
man_MANS=ssdeep.1
ssdeep_SOURCES = main.cpp match.cpp engine.cpp filedata.cpp \
dig.cpp cycles.cpp helpers.cpp ui.cpp \
main.h fuzzy.h tchar-local.h ssdeep.h filedata.h match.h
dll: $(libfuzzy_la_SOURCES)
$(CC) $(CFLAGS) -shared -o fuzzy.dll $(libfuzzy_la_SOURCES) \
-Wl,--output-def,fuzzy.def,--out-implib,libfuzzy.a
$(STRIP) fuzzy.dll
CLEANFILES=fuzzy.dll fuzzy.def
EXTRA_DIST=$(man_MANS) config.guess config.sub sample.c FILEFORMAT
WINDOWSDOCS=README.TXT API.TXT FILEFORMAT.TXT NEWS.TXT
README.TXT: ssdeep.1
man ./ssdeep.1 | col -bx > README.TXT
API.TXT: README
cp README API.TXT
FILEFORMAT.TXT: FILEFORMAT
cp FILEFORMAT FILEFORMAT.TXT
NEWS.TXT: NEWS
cp NEWS NEWS.TXT
win-docs: $(WINDOWSDOCS)
# flip -d $(WINDOWSDOCS)
# unix2dos $(WINDOWSDOCS)
win-package: win-docs
rm -rf $(distdir).zip $(distdir)
make
make dll
$(STRIP) ssdeep.exe
mkdir $(distdir)
cp $(WINDOWSDOCS) ssdeep.exe fuzzy.dll fuzzy.def sample.c $(distdir)
# flip -d $(distdir)/{sample.c,fuzzy.def}
# unix2dos $(distdir)/{sample.c,fuzzy.def}
zip -lr9 $(distdir).zip $(distdir)
rm -rf $(distdir) $(WINDOWSDOCS)
world: distclean
./configure --host=i386-mingw32
make win-package
make dist
# Only generic routines go below this line
# ------------------------------------------------------------------
nice:
rm -f *~
preflight:
@grep RBF $(DISTFILES)
** Version 2.10 - 17 Jul 2013
* New Features
- Fuzzy Hashing engine re-written to be thread safe.
* Bug Fixes
- Able to handle long file paths on Win32.
- Fixed bug on comparing signatures with the same block size.
- Fixed crash on comparing short signatures.
- Fixed memory leak
** Version 2.9 - 23 Jul 2012
* New Features
- Added warning message for when some data on stdin is not hashed.
- Can now hash up to 512MB of data on stdin.
- Added clustering mode to group together matching files
* Bug Fixes
- Fixed incorrect match scores for hashes with long filenames.
** Version 2.8 - 25 May 2012
* New Features
- Converted to C++
* Bug Fixes
- Fixed filename display on Win32.
- Fixed support for large files on some platforms.
- Fixed errors in handling command line argument processing.
** Version 2.7 - 30 Sep 2011
* New Features
- Added the capability to process the first 100MB of data
from standard input.
- Added a warning message when the program does not process
any file large enough to produce a meaningful result.
* Bug Fixes
- Standard errors are now sent to stderr, not stdout.
** Version 2.6 - 28 Sep 2010
* New Features
- Modified the output file format to allow for proper escaping of
filenames with quotation marks in them.
* Bug Fixes
- Added quotation marks to filenames in CSV matching mode.
** Version 2.5 - 6 May 2010
* New Features
- Added API documentation
- Added return values indicating errors in API functions
- Added compatibility for compiling with C++
* Bug Fixes
- Added parameter validation to API functions
- Fixed some cosmetic errors in error handling
** Version 2.4 - 25 Feb 2010
* New Features
- Added -k mode to compare unknown signatures against known signatures.
** Version 2.3 - 10 Jan 2010
* New Features
- Added -a mode to display all 'matches', regardless of score.
** Version 2.2 - 22 Jul 2009
* New Features
- Added capability to compare two or more files containing signatures
against one another.
* Bug Fixes
- Changed default behavior to exit program on invalid command line flags
** Version 2.1 - 1 Jan 2009
* New Features
- Added fuzzy_hash_filename function to hash an entire file given
only its filename. Avoids issues on Win32 systems.
* Bug Fixes
- Fixed -p mode to display output
** Version 2.0 - 2 Apr 2008
* New Features
- Created fuzzy hashing API/DLL
- Added support for filenames with Unicode characters on Win32
- Added threshold mode
- Added CSV mode
* Bug Fixes
- Fixed extra characters appearing during verbose mode
** Version 1.1 - 14 Aug 2006
* New Features
- First public release
- Added verbose mode to display filenames as they're being hashed
- Added -d mode to make finding similar files in the same directory tree
both easier and faster. Removes the need for two command lines and
many extraneous lines of output.
- Added -p mode to improve -d mode. Prints bi-directional matches together
and omits self matches.
- Added LARGEFILE_SOURCE define to Linux version to allow processing
of large files. (You never know...)
* Bug Fixes
- Fixed cosmetic errors in usage message. Updated man page.
** Version 1.0 - 31 Mar 2006
* New Features
- Released internally
- Added silent mode, -s. All error messages are suppressed.
* Bug Fixes
- Fixed failure to close files after reading in engine.c
- Fixed routine to read headers of matching hashes on Windows.
- Fixed handling of symbolic links
- Fixed cosmetic bug to display error messages if file open fails
(e.g. Permission denied, etc)
- Removed quotation marks from the signatures but not the file names.
Filenames may contain spaces, but signatures may not. Two bytes
per line adds up when we start compiling large hash sets.
- Redirected all error messages to stderr instead of stdout
- Removed duplicate defines at the start of engine.c
- Replaced all references to u32 with C99 standard uint32_t
- Added error checking for memory allocation in main.c:main() and
engine.c:hash_file()
- Removed useless logical AND of 0xFFFFFFFF from rolling hash update
** Version 0.1 - 4 Nov 2005
* New Features
- Proof of concept
- This version supports recursion, relative and bare file names, and
can perform positive matching using a previous output.
**** FUZZY HASHING API ****
This file documents the fuzzy hashing API. Information on how to use the
fuzzy hashing program ssdeep can be found in the man page. On *nix
systems you can view this file with:
$ man ./ssdeep.1
Windows users can get the ssdeep usage information from README.TXT.
** Using the API in Your Own Programs **
You can use the fuzzy hashing API in your own programs by doing
the following:
1. Include the fuzzy hashing header
#include <fuzzy.h>
2. Call one of the functions:
* Fuzzy hashing a buffer of text:
int fuzzy_hash_buf(const unsigned char *buf,
uint32_t buf_len,
char *result);
This function computes the fuzzy hash of the buffer 'buf' and stores the
result in result. You MUST allocate result to hold FUZZY_MAX_RESULT
characters before calling this function. The length of the buffer should
be passed in via buf_len. It is the user's responsibility to append the
filename, if any, to the output. The function returns zero on success,
one on error.
* Fuzzy hashing a file:
There are in fact two ways to fuzzy hash a file. If you already
have an open file handle you can use:
int fuzzy_hash_file(FILE *handle,
char *result);
This function computes the fuzzy hash of the file pointed to by handle
and stores the result in result. You MUST allocate result to hold
FUZZY_MAX_RESULT characters before calling this function. It is the
user's responsibility to append the filename to the output.
The function returns zero on success, one on error.
The other function to hash a file takes a file name:
int fuzzy_hash_filename(const char * filename,
char * result);
Like the function above, this function stores the fuzzy hash result
in the parameter result. You MUST allocate result to hold
FUZZY_MAX_RESULT characters before calling this function.
* Compare two fuzzy hash signatures:
int fuzzy_compare(const char *sig1, const char *sig2);
This function returns a value from 0 to 100 indicating the match
score of the two signatures. A match score of zero indicates the
signatures did not match.
3. Compile
To compile the program using gcc:
$ gcc -Wall -I/usr/local/include -L/usr/local/lib sample.c -lfuzzy
Using mingw:
C:\> gcc -Wall -Ic:\path\to\includes sample.c fuzzy.dll
Using Microsoft Visual C (MSVC):
To paraphrase the MinGW documentation,
http://www.mingw.org/mingwfaq.shtml#faq-msvcdll:
The Windows ssdeep package includes a Win32 DLL and a .def file. Although
MSVC users can't use the DLL directly, they can easily create a .lib file
using the Microsoft LIB tool:
C:\> lib /machine:i386 /def:fuzzy.def
You can then compile your program using the resulting library:
C:\> cl sample.c fuzzy.lib
** Sample Program **
A sample program that uses the API is in sample.c.
** See Also **
- Jesse D. Kornblum, "Identifying almost identical files using context
triggered piecewise hashing", Digital Investigation, 3(S):91-97,
September 2006, http://dx.doi.org/10.1016/j.diin.2006.06.015,
The Proceedings of the 6th Annual Digital Forensic Research Workshop
\ No newline at end of file
- Update man page
- Update web page, to include new man page
- Write README
- Find a way to estimate device sizes on Windows
Perhaps an IOCTL_DISK_GET_DRIVE_GEOMETRY_EX would work?
- See if Windows Vista's symbolic links create problems for dig.c
This source diff could not be displayed because it is too large. You can view the blob instead.
#! /bin/sh
# Wrapper for compilers which do not understand '-c -o'.
scriptversion=2012-10-14.11; # UTC
# Copyright (C) 1999-2013 Free Software Foundation, Inc.
# Written by Tom Tromey <tromey@cygnus.com>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
# the same distribution terms that you use for the rest of that program.
# This file is maintained in Automake, please report
# bugs to <bug-automake@gnu.org> or send patches to
# <automake-patches@gnu.org>.
nl='
'
# We need space, tab and new line, in precisely that order. Quoting is
# there to prevent tools from complaining about whitespace usage.
IFS=" "" $nl"
file_conv=
# func_file_conv build_file lazy
# Convert a $build file to $host form and store it in $file
# Currently only supports Windows hosts. If the determined conversion
# type is listed in (the comma separated) LAZY, no conversion will
# take place.
func_file_conv ()
{
# Input:  $1 = file name in build form, $2 = optional comma separated list
#         of conversion types for which no conversion should be done.
# Output: the (possibly converted) name is left in the global $file.
# Side effect: the detected conversion type is cached in the global
# $file_conv so that uname is only run once.
file=$1
case $file in
# Only names that start with exactly one slash are converted; relative
# names and names starting with two slashes fall through unchanged.
/ | /[!/]*) # absolute file, and not a UNC file
if test -z "$file_conv"; then
# lazily determine how to convert abs files
case `uname -s` in
MINGW*)
file_conv=mingw
;;
CYGWIN*)
file_conv=cygwin
;;
*)
# Any other system is handled with winepath below.
file_conv=wine
;;
esac
fi
# The string matched here is "<type>/,<lazy list>,".
case $file_conv/,$2, in
# The detected type appears in the lazy list: leave $file untouched.
*,$file_conv,*)
;;
mingw/*)
# Echo the name through cmd and let sed strip the quoting and the
# trailing space that was added to the argument.
file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'`
;;
cygwin/*)
# Fall back to the unconverted name if cygpath fails.
file=`cygpath -m "$file" || echo "$file"`
;;
wine/*)
# Fall back to the unconverted name if winepath fails.
file=`winepath -w "$file" || echo "$file"`
;;
esac
;;
esac
}
# func_cl_dashL linkdir
# Make cl look for libraries in LINKDIR
func_cl_dashL ()
{
  # Convert the directory to host form; the converted name comes back in $file.
  func_file_conv "$1"

  # lib_path is a ';' separated list: start it, or append to it.
  case $lib_path in
    '')
      lib_path=$file
      ;;
    *)
      lib_path="$lib_path;$file"
      ;;
  esac

  # Also hand the directory to the linker.
  linker_opts="$linker_opts -LIBPATH:$file"
}
# func_cl_dashl library
# Do a library search-path lookup for cl
func_cl_dashl ()
{
# Input:  $1 = bare library name (the part after -l).
# Output: the global $lib holds the file to pass to the compiler: the first
#         match found in the search path, or "$1.lib" if nothing matched.
# Reads the globals $lib_path, $LIB and $shared.
lib=$1
found=no
# Split the search lists on ';' for the duration of the word expansion only.
save_IFS=$IFS
IFS=';'
for dir in $lib_path $LIB
do
# Restore the normal IFS for the commands inside the loop body.
IFS=$save_IFS
# $shared is run as a command (set to ':' or 'false' by the caller), so the
# import library is only considered when -static was not given.
if $shared && test -f "$dir/$lib.dll.lib"; then
found=yes
lib=$dir/$lib.dll.lib
break
fi
if test -f "$dir/$lib.lib"; then
found=yes
lib=$dir/$lib.lib
break
fi
if test -f "$dir/lib$lib.a"; then
found=yes
lib=$dir/lib$lib.a
break
fi
done
# Needed when the loop body never ran (empty search path).
IFS=$save_IFS
if test "$found" != yes; then
# Not found anywhere: fall back to the plain NAME.lib spelling.
lib=$lib.lib
fi
}
# func_cl_wrapper cl arg...
# Adjust compile command to suit cl
func_cl_wrapper ()
{
# Rewrites the command line in "$@" (compiler followed by its arguments)
# from cc style options to cl style options and then execs it.
#
# The loop walks the original arguments once.  Every iteration ends with a
# 'shift' that drops the argument just examined; arguments that are kept are
# first re-appended (possibly translated) to the end of the positional list
# with the 'set x "$@" ...; shift' idiom.  When the loop finishes, "$@" holds
# only the rewritten command line.
# Assume a capable shell
lib_path=
shared=:
linker_opts=
for arg
do
# A non-empty $eat means this argument was already consumed as the operand
# of the previous option, so it is dropped without being looked at.
if test -n "$eat"; then
eat=
else
case $1 in
-o)
# configure might choose to run compile as 'compile cc -o foo foo.c'.
eat=1
case $2 in
*.o | *.[oO][bB][jJ])
# Object file output is requested with -Fo.
func_file_conv "$2"
set x "$@" -Fo"$file"
shift
;;
*)
# Any other output name is passed with -Fe.
func_file_conv "$2"
set x "$@" -Fe"$file"
shift
;;
esac
;;
-I)
# Include directory given as a separate word.
eat=1
func_file_conv "$2" mingw
set x "$@" -I"$file"
shift
;;
-I*)
func_file_conv "${1#-I}" mingw
set x "$@" -I"$file"
shift
;;
-l)
# Library given as a separate word; replaced by the resolved file name.
eat=1
func_cl_dashl "$2"
set x "$@" "$lib"
shift
;;
-l*)
func_cl_dashl "${1#-l}"
set x "$@" "$lib"
shift
;;
-L)
# Library directories are only recorded (lib_path / linker_opts); nothing is
# re-appended to the command line here.
eat=1
func_cl_dashL "$2"
;;
-L*)
func_cl_dashL "${1#-L}"
;;
-static)
# Makes func_cl_dashl skip the .dll.lib candidates.
shared=false
;;
-Wl,*)
# Split the comma separated linker flags and collect them in linker_opts.
arg=${1#-Wl,}
save_ifs="$IFS"; IFS=','
for flag in $arg; do
IFS="$save_ifs"
linker_opts="$linker_opts $flag"
done
IFS="$save_ifs"
;;
-Xlinker)
eat=1
linker_opts="$linker_opts $2"
;;
-*)
# Any other option is kept unchanged.
set x "$@" "$1"
shift
;;
*.cc | *.CC | *.cxx | *.CXX | *.[cC]++)
# These source suffixes are passed with a -Tp prefix.
func_file_conv "$1"
set x "$@" -Tp"$file"
shift
;;
*.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO])
func_file_conv "$1" mingw
set x "$@" "$file"
shift
;;
*)
set x "$@" "$1"
shift
;;
esac
fi
# Drop the original argument that was just processed.
shift
done
if test -n "$linker_opts"; then
# linker_opts always starts with a space, so this yields "-link <opts>".
linker_opts="-link$linker_opts"
fi
# $linker_opts is intentionally unquoted so that it splits into words.
exec "$@" $linker_opts
# Only reached if exec itself fails.
exit 1
}
# Entry point: decide what to do based on the first argument.
# $eat is the "skip the next argument" flag shared by the argument loops.
eat=
case $1 in
'')
# No compiler given at all.
echo "$0: No command. Try '$0 --help' for more information." 1>&2
exit 1;
;;
-h | --h*)
cat <<\EOF
Usage: compile [--help] [--version] PROGRAM [ARGS]
Wrapper for compilers which do not understand '-c -o'.
Remove '-o dest.o' from ARGS, run PROGRAM with the remaining
arguments, and rename the output as expected.
If you are trying to build a whole package this is not the
right script to run: please start by reading the file 'INSTALL'.
Report bugs to <bug-automake@gnu.org>.
EOF
exit $?
;;
-v | --v*)
echo "compile $scriptversion"
exit $?
;;
# The compiler is cl (with or without a directory prefix or .exe suffix):
# translate the whole command line instead of emulating '-c -o'.
cl | *[/\\]cl | cl.exe | *[/\\]cl.exe )
func_cl_wrapper "$@" # Doesn't return...
;;
esac
# Emulation of '-c -o' for compilers that cannot handle it.
# ofile: object file named after -o (empty if none was seen).
# cfile: the .c source file on the command line (the last one, if several).
ofile=
cfile=
# Same argument rotation as in func_cl_wrapper: kept arguments are appended
# to the end of "$@", and the trailing 'shift' drops the original.
for arg
do
if test -n "$eat"; then
eat=
else
case $1 in
-o)
# configure might choose to run compile as 'compile cc -o foo foo.c'.
# So we strip '-o arg' only if arg is an object.
eat=1
case $2 in
*.o | *.obj)
ofile=$2
;;
*)
set x "$@" -o "$2"
shift
;;
esac
;;
*.c)
cfile=$1
set x "$@" "$1"
shift
;;
*)
set x "$@" "$1"
shift
;;
esac
fi
shift
done
if test -z "$ofile" || test -z "$cfile"; then
# If no '-o' option was seen then we might have been invoked from a
# pattern rule where we don't need one. That is ok -- this is a
# normal compilation that the losing compiler can handle. If no
# '.c' file was seen then we are probably linking. That is also
# ok.
exec "$@"
fi
# Name of file we expect compiler to create.
# (directory part and drive letter removed, .c suffix replaced by .o)
cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'`
# Create the lock directory.
# Note: use '[/\\:.-]' here to ensure that we don't use the same name
# that we are using for the .o file. Also, base the name on the expected
# object file name, since that is what matters with a parallel build.
lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d
# mkdir either creates the directory or fails, so it serves as the lock;
# retry once per second until it succeeds.
while true; do
if mkdir "$lockdir" >/dev/null 2>&1; then
break
fi
sleep 1
done
# FIXME: race condition here if user kills between mkdir and trap.
# Release the lock if the script is interrupted (signals 1, 2 and 15).
trap "rmdir '$lockdir'; exit 1" 1 2 15
# Run the compile.
"$@"
ret=$?
# Move whatever the compiler produced (NAME.o or NAME.obj) to the name that
# was requested with -o, unless it already has that name.
if test -f "$cofile"; then
test "$cofile" = "$ofile" || mv "$cofile" "$ofile"
elif test -f "${cofile}bj"; then
test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile"
fi
rmdir "$lockdir"
# Report the exit status of the compiler, not of the cleanup commands.
exit $ret
# Local Variables:
# mode: shell-script
# sh-indentation: 2
# eval: (add-hook 'write-file-hooks 'time-stamp)
# time-stamp-start: "scriptversion="
# time-stamp-format: "%:y-%02m-%02d.%02H"
# time-stamp-time-zone: "UTC"
# time-stamp-end: "; # UTC"
# End:
/* config.h.in. Generated from configure.ac by autoheader. */
/* Define if building universal (internal helper macro) */
#undef AC_APPLE_UNIVERSAL_BUILD
/* Define to 1 if you have the <dirent.h> header file. */
#undef HAVE_DIRENT_H
/* Define to 1 if you have the <dlfcn.h> header file. */
#undef HAVE_DLFCN_H
/* Define to 1 if you have the <fcntl.h> header file. */
#undef HAVE_FCNTL_H
/* Define to 1 if fseeko (and presumably ftello) exists and is declared. */
#undef HAVE_FSEEKO
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
/* Define to 1 if you have the <libgen.h> header file. */
#undef HAVE_LIBGEN_H
/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H
/* Define to 1 if you have the <stdint.h> header file. */
#undef HAVE_STDINT_H
/* Define to 1 if you have the <stdlib.h> header file. */
#undef HAVE_STDLIB_H
/* Define to 1 if you have the <strings.h> header file. */
#undef HAVE_STRINGS_H
/* Define to 1 if you have the <string.h> header file. */
#undef HAVE_STRING_H
/* Define to 1 if you have the <sys/disk.h> header file. */
#undef HAVE_SYS_DISK_H
/* Define to 1 if you have the <sys/ioctl.h> header file. */
#undef HAVE_SYS_IOCTL_H
/* Define to 1 if you have the <sys/mount.h> header file. */
#undef HAVE_SYS_MOUNT_H
/* Define to 1 if you have the <sys/param.h> header file. */
#undef HAVE_SYS_PARAM_H
/* Define to 1 if you have the <sys/stat.h> header file. */
#undef HAVE_SYS_STAT_H
/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
/* Define to 1 if you have the <wchar.h> header file. */
#undef HAVE_WCHAR_H
/* Define to the sub-directory in which libtool stores uninstalled libraries.
*/
#undef LT_OBJDIR
/* Name of package */
#undef PACKAGE
/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT
/* Define to the full name of this package. */
#undef PACKAGE_NAME
/* Define to the full name and version of this package. */
#undef PACKAGE_STRING
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the home page for this package. */
#undef PACKAGE_URL
/* Define to the version of this package. */
#undef PACKAGE_VERSION
/* Define to 1 if you have the ANSI C header files. */
#undef STDC_HEADERS
/* Version number of package */
#undef VERSION
/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
significant byte first (like Motorola and SPARC, unlike Intel). */
#if defined AC_APPLE_UNIVERSAL_BUILD
# if defined __BIG_ENDIAN__
# define WORDS_BIGENDIAN 1
# endif
#else
# ifndef WORDS_BIGENDIAN
# undef WORDS_BIGENDIAN
# endif
#endif
/* Enable large inode numbers on Mac OS X 10.5. */
#ifndef _DARWIN_USE_64_BIT_INODE
# define _DARWIN_USE_64_BIT_INODE 1
#endif
/* Number of bits in a file offset, on hosts where this is settable. */
#undef _FILE_OFFSET_BITS
/* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */
#undef _LARGEFILE_SOURCE
/* Define for large files, on AIX-style hosts. */
#undef _LARGE_FILES
/* Linux operating system functions */
#undef __LINUX__
This source diff could not be displayed because it is too large. You can view the blob instead.
AC_INIT([SSDEEP],[2.10],[research@jessekornblum.com])
AM_INIT_AUTOMAKE
AC_CONFIG_FILES([Makefile])
AM_CONFIG_HEADER([config.h])
AC_CANONICAL_HOST
AC_PROG_CC
AC_PROG_CXX
AC_LIBTOOL_WIN32_DLL
#AC_PROG_LIBTOOL
AM_PROG_LIBTOOL
AC_PROG_INSTALL
AC_CONFIG_MACRO_DIR([m4])
# Per-host settings: Linux hosts get the __LINUX__ define, MinGW hosts link
# with -liberty and compile with UNICODE and _UNICODE defined.
case $host in
*-*-*linux*-*) AC_DEFINE([__LINUX__],1,[Linux operating system functions]) ;;
*-*-mingw32) LIBS="-liberty $LIBS" && CPPFLAGS="-DUNICODE -D_UNICODE $CPPFLAGS"
esac
# Bring additional directories where things might be found into our
# search path. I don't know why autoconf doesn't do this by default
for spfx in /usr/local /opt/local /sw ; do
echo checking ${spfx}/include
# Only the include directory is tested; the matching lib directory is added
# to LDFLAGS without a separate existence check.
if test -d ${spfx}/include; then
CPPFLAGS="-I${spfx}/include $CPPFLAGS"
LDFLAGS="-L${spfx}/lib $LDFLAGS"
fi
done
AC_C_BIGENDIAN
AC_SYS_LARGEFILE
AC_CHECK_HEADERS([libgen.h])
AC_CHECK_HEADERS([dirent.h])
AC_CHECK_HEADERS([inttypes.h])
AC_CHECK_HEADERS([fcntl.h sys/types.h sys/ioctl.h sys/param.h wchar.h unistd.h sys/stat.h sys/disk.h])
AC_CHECK_HEADER([inttypes.h],,AC_MSG_ERROR([You must have inttypes.h or some other C99 equivalent]),)
# These includes are required on FreeBSD
AC_CHECK_HEADERS([sys/mount.h],[],[],
[#ifdef HAVE_SYS_TYPES_H
# include <sys/types.h>
#endif
#ifdef HAVE_SYS_PARAM_H
# include <sys/param.h>
#endif])
AC_FUNC_FSEEKO
AC_OUTPUT
/* MD5DEEP
*
* By Jesse Kornblum
*
* This is a work of the US Government. In accordance with 17 USC 105,
* copyright protection is not available for any work of the US Government.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
*/
// $Id: cycles.cpp 184 2013-07-10 05:24:26Z jessekornblum $
#include "ssdeep.h"
// One node of the singly linked list of directory names kept in my_table.
typedef struct dir_table {
// Name string owned by the node; released with free() when the node is removed
TCHAR *name;
// Next node in the list, or NULL for the last node
struct dir_table *next;
} dir_table;
// Head of the list; NULL while the list is empty
dir_table *my_table = NULL;
/* This function was used in the dark ages for debugging
static void dump_table(void)
{
struct dir_table *t = my_table;
while (t != NULL)
{
print_status (_TEXT("* %s"), t->name);
t = t->next;
}
print_status ("-- end of table --");
}
*/
// Removes the entry for directory fn from the table of directories.
//
// fn is first converted to a full path (_wfullpath on Windows, realpath
// elsewhere); the first table entry whose name matches that path is
// unlinked and freed.
//
// Returns TRUE when an entry was removed. Every other outcome is reported
// through internal_error(); the existing comments state that it does not
// return, but FALSE is returned (with all memory released) in case it does.
int done_processing_dir(TCHAR *fn)
{
  dir_table **link, *match;
  TCHAR *d_name = (TCHAR *)malloc(sizeof(TCHAR) * SSDEEP_PATH_MAX);

  // The buffer is written to below, so it must not be NULL
  if (NULL == d_name)
  {
    internal_error("%s: Out of memory in done_processing_dir", __progname);
    return FALSE;
  }

#ifdef _WIN32
  _wfullpath(d_name,fn,SSDEEP_PATH_MAX);
#else
  realpath(fn,d_name);
#endif

  if (my_table == NULL)
  {
    internal_error("Table is NULL in done_processing_dir");

    // This code never gets executed...
    free(d_name);
    return FALSE;
  }

  // 'link' always points at the pointer that refers to the current node
  // (either my_table itself or the previous node's next field), so the
  // head and interior nodes are unlinked the same way.
  for (link = &my_table; *link != NULL; link = &(*link)->next)
  {
    if (!_tcsncmp(d_name,(*link)->name,SSDEEP_PATH_MAX))
    {
      match = *link;
      *link = match->next;
      free(match->name);
      free(match);
      free(d_name);
      return TRUE;
    }
  }

  internal_error("%s: Directory %s not found in done_processing_dir",
		 __progname, d_name);

  // This code never gets executed, but do not leak the buffer if it does
  free(d_name);
  return FALSE;
}
// Adds directory fn to the end of the table of directories.
//
// fn is first converted to a full path (_wfullpath on Windows, realpath
// elsewhere) and a copy of that path is stored in a new table entry.
//
// Returns TRUE when the entry was added. A directory that is already in
// the table, or a failed allocation, is reported through internal_error();
// the existing comments state that it does not return, but FALSE is
// returned (with all memory released) in case it does.
int processing_dir(TCHAR *fn)
{
  dir_table *new_dir, **link;
  TCHAR *d_name = (TCHAR *)malloc(sizeof(TCHAR) * SSDEEP_PATH_MAX);

  // The buffer is written to below, so it must not be NULL
  if (NULL == d_name)
  {
    internal_error("%s: Out of memory in processing_dir", __progname);
    return FALSE;
  }

#ifdef _WIN32
  _wfullpath(d_name,fn,SSDEEP_PATH_MAX);
#else
  realpath(fn,d_name);
#endif

  // Walk the whole list. Every node, including the last one, is compared
  // against the new name; when the loop ends 'link' points at the NULL
  // pointer (my_table or the last node's next field) where the new node
  // has to be attached.
  for (link = &my_table; *link != NULL; link = &(*link)->next)
  {
    /* We should never be adding a directory that is already here */
    if (!_tcsncmp((*link)->name,d_name,SSDEEP_PATH_MAX))
    {
      internal_error("%s: Attempt to add existing %s in processing_dir",
		     __progname, d_name);
      // Does not execute
      free(d_name);
      return FALSE;
    }
  }

  new_dir = (dir_table*)malloc(sizeof(dir_table));
  if (new_dir != NULL)
    new_dir->name = _tcsdup(d_name);

  if (NULL == new_dir || NULL == new_dir->name)
  {
    free(new_dir);
    free(d_name);
    internal_error("%s: Out of memory in processing_dir", __progname);
    return FALSE;
  }

  new_dir->next = NULL;
  *link = new_dir;

  free(d_name);
  return TRUE;
}
// Reports whether directory fn is currently in the table of directories.
//
// fn is first converted to a full path (_wfullpath on Windows, realpath
// elsewhere) and compared against the name of every table entry.
//
// Returns TRUE if a matching entry exists. Returns FALSE if the table is
// empty, no entry matches, or the path buffer cannot be allocated.
int have_processed_dir(TCHAR *fn)
{
  dir_table *node;
  TCHAR *d_name;
  int found = FALSE;

  if (my_table == NULL)
    return FALSE;

  d_name = (TCHAR *)malloc(sizeof(TCHAR) * SSDEEP_PATH_MAX);
  // Without a buffer the path cannot be resolved, so nothing can match
  if (NULL == d_name)
    return FALSE;

#ifdef _WIN32
  _wfullpath(d_name,fn,SSDEEP_PATH_MAX);
#else
  realpath(fn,d_name);
#endif

  for (node = my_table; node != NULL; node = node->next)
  {
    if (!_tcsncmp(node->name,d_name,SSDEEP_PATH_MAX))
    {
      found = TRUE;
      break;
    }
  }

  free(d_name);
  return found;
}
/*
This edit distance code is taken from trn3.6. A few minor
modifications have been made by Andrew Tridgell <tridge@samba.org>
for use in spamsum.
*/
/***************************************************************************/
/* The authors make no claims as to the fitness or correctness of this software
* for any use whatsoever, and it is provided as is. Any use of this software
* is at the user's own risk.
*/
#include <stdio.h>
#include <stdlib.h>
/* edit_dist -- returns the minimum edit distance between two strings
Program by: Mark Maimone CMU Computer Science 13 Nov 89
Last Modified: 28 Jan 90
If the input strings have length n and m, the algorithm runs in time
O(nm) and space O(min(m,n)).
HISTORY
13 Nov 89 (mwm) Created edit_dist() and set_costs().
28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD
computations will work even when THRESHOLD is not a multiple of
sizeof(int).
17 May 93 (mwm) Improved performance when used with trn's newsgroup
processing; assume all costs are 1, and you can terminate when a
threshold is exceeded.
*/
#define MIN_DIST 100

#define TRN_SPEEDUP		/* Use a less-general version of the
				   routine, one that's better for trn.
				   The edit costs are compile-time
				   constants, and it's okay
				   to terminate if the edit distance is
				   known to exceed MIN_DIST */

#define THRESHOLD 4000		/* worry about allocating more memory only
				   when this # of bytes is exceeded */

/* Longest "from" string whose two-row buffer (2 * len + 3 ints) still fits
   in THRESHOLD bytes of automatic storage */
#define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))

#define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))

#define swap_int(x,y)  do { int _iswap = (x); (x) = (y); (y) = _iswap; } while (0)
#define swap_char(x,y) do { const char *_cswap = (x); (x) = (y); (y) = _cswap; } while (0)

/* Returns the smallest of three ints */
static inline int min3(int x, int y, int z) {
  return x < y ? (x < z ? x : z) : (z < y) ? z : y;
}

/* Returns the smaller of two ints */
static inline int min2(int x, int y)
{
  return x < y ? x : y;
}

/* Costs used for the trivial (empty string) cases, and for the whole
   computation when TRN_SPEEDUP is not defined */
static int insert_cost = 1;
static int delete_cost = 1;
#ifndef TRN_SPEEDUP
static int change_cost = 1;
static int swap_cost   = 1;
#endif

/* edit_distn -- returns the edit distance between two strings, or -1 on
   failure

   from, from_len   first string and its length; need not be terminated
   to, to_len       second string and its length; need not be terminated

   A NULL pointer is treated like an empty string.  When one string is
   empty the result is the length of the other one times the insert or
   delete cost.

   With TRN_SPEEDUP defined the costs are insert 1, delete 1, change 3 and
   swap of two adjacent characters 5, and the computation stops as soon as
   every entry of a row exceeds MIN_DIST; in that case the value returned
   is the last entry computed, not the true distance.

   Returns -1 only if the row buffer cannot be allocated. */
int
edit_distn(const char *from, int from_len, const char *to, int to_len)
{
#ifndef TRN_SPEEDUP
  int ins, del, ch;		/* local copies of edit costs */
#endif
  int row, col, index;		/* dynamic programming counters */
  int radix;			/* radix for modular indexing */
#ifdef TRN_SPEEDUP
  int low;			/* smallest entry of the current row */
#endif
  int *buffer;			/* pointer to storage for one row
				   of the d.p. array */
  int store[THRESHOLD / sizeof (int)];
				/* a small amount of automatic
				   storage, to be used when the
				   input strings are small enough */

  /* Handle trivial cases when one string is empty */
  if (from == NULL || !from_len) {
    if (to == NULL || !to_len)
      return 0;
    return to_len * insert_cost;
  }
  if (to == NULL || !to_len)
    return from_len * delete_cost;

  /* Initialize the edit costs */
#ifdef TRN_SPEEDUP
#define ins 1
#define del 1
#define ch 3
#define swap_cost 5
#else
  ins = insert_cost;
  del = delete_cost;
  ch = change_cost;
#endif

  /* Make from short enough to fit in the static storage, if it's at all
     possible */
  if (from_len > to_len && from_len > STRLENTHRESHOLD) {
    swap_int(from_len, to_len);
    swap_char(from, to);
#ifndef TRN_SPEEDUP
    swap_int(ins, del);
#endif
  } /* if from_len > to_len */

  /* The radix must be derived from the final from_len, i.e. after the
     swap above: the NW() and N() offsets below are only correct when
     radix == 2 * from_len + 3, and a radix taken from the longer string
     could also index past the end of store. */
  radix = 2 * from_len + 3;

  /* Allocate the array storage (from the heap if necessary) */
  if (from_len <= STRLENTHRESHOLD)
    buffer = store;
  else {
    buffer = (int *) malloc(radix * sizeof (int));
    if (buffer == NULL)
      return -1;
  }

  /* Here's where the fun begins.  We will find the minimum edit distance
     using dynamic programming.  We only need to store two rows of the matrix
     at a time, since we always progress down the matrix.  For example,
     given the strings "one" and "two", and insert, delete and change costs
     equal to 1:

        _  o  n  e
     _  0  1  2  3
     t  1  1  2  3
     w  2  2  2  3
     o  3  2  3  3

     The dynamic programming recursion is defined as follows:

     ar(x,0) := x * insert_cost
     ar(0,y) := y * delete_cost
     ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
                    a(x - 1, y) + insert_cost,
                    a(x, y - 1) + delete_cost,
                    a(x - 2, y - 2) + (from[x] == to[y-1] &&
                                       from[x-1] == to[y] ? swap_cost :
                                       infinity))

     Since this only looks at most two rows and three columns back, we need
     only store the values for the two preceding rows.  In this
     implementation, we do not explicitly store the zero column, so only 2 *
     from_len + 2 words are needed.  However, in the implementation of the
     swap_cost check, the current matrix value is used as a buffer; we
     can't overwrite the earlier value until the swap_cost check has
     been performed.  So we use 2 * from_len + 3 elements in the buffer.
  */

#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \
	buffer[mod(index)]))
#define NW(x,y)	  ar(x, y, index + from_len + 2)
#define N(x,y)	  ar(x, y, index + from_len + 3)
#define W(x,y)	  ar(x, y, index + radix - 1)
#define NNWW(x,y) ar(x, y, index + 1)
#define mod(x) ((x) % radix)

  index = 0;

#ifdef DEBUG_EDITDIST
  printf(" ");
  for (col = 0; col < from_len; col++)
    printf(" %c ", from[col]);
  printf("\n ");
  for (col = 0; col <= from_len; col++)
    printf("%2d ", col * del);
#endif

  /* Row 0 is handled implicitly; its value at a given column is col*del.
     The loop below computes the values for Row 1.  At this point we know the
     strings are nonempty.  We also don't need to consider swap costs in row
     1.

     COMMENT:  the indices row and col below point into the STRING, so
     the corresponding MATRIX indices are row+1 and col+1.
  */

  buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch));
#ifdef TRN_SPEEDUP
  low = buffer[mod(index + radix - 1)];
#endif

#ifdef DEBUG_EDITDIST
  printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);
#endif

  for (col = 1; col < from_len; col++) {
    buffer[index] = min3(
			 col * del + ((from[col] == to[0]) ? 0 : ch),
			 (col + 1) * del + ins,
			 buffer[index - 1] + del);
#ifdef TRN_SPEEDUP
    if (buffer[index] < low)
      low = buffer[index];
#endif
    index++;

#ifdef DEBUG_EDITDIST
    printf("%2d ", buffer[index - 1]);
#endif

  } /* for col = 1 */

#ifdef DEBUG_EDITDIST
  printf("\n %c %2d ", to[1], 2 * ins);
#endif

  /* Now handle the rest of the matrix */
  for (row = 1; row < to_len; row++) {
    for (col = 0; col < from_len; col++) {
      buffer[index] = min3(
			   NW(row, col) + ((from[col] == to[row]) ? 0 : ch),
			   N(row, col + 1) + ins,
			   W(row + 1, col) + del);
      if (from[col] == to[row - 1] && col > 0 &&
	  from[col - 1] == to[row])
	buffer[index] = min2(buffer[index],
			     NNWW(row - 1, col - 1) + swap_cost);

#ifdef DEBUG_EDITDIST
      printf("%2d ", buffer[index]);
#endif
#ifdef TRN_SPEEDUP
      /* col == 0 starts a new row, so the row minimum is reset there */
      if (buffer[index] < low || col == 0)
	low = buffer[index];
#endif

      index = mod(index + 1);
    } /* for col = 1 */
#ifdef DEBUG_EDITDIST
    if (row < to_len - 1)
      printf("\n %c %2d ", to[row+1], (row + 2) * ins);
    else
      printf("\n");
#endif
#ifdef TRN_SPEEDUP
    /* Every entry of this row already exceeds MIN_DIST: give up early */
    if (low > MIN_DIST)
      break;
#endif
  } /* for row = 1 */

  /* The most recently written entry is the result */
  row = buffer[mod(index + radix - 1)];
  if (buffer != store)
    free((char *) buffer);
  return row;
} /* edit_distn */
// $Id: engine.cpp 184 2013-07-10 05:24:26Z jessekornblum $
#include "main.h"
#include "ssdeep.h"
#include "match.h"
#define MAX_STATUS_MSG 78
// Handles the fuzzy hash 'sum' computed for file 'fn' according to the
// current mode.
//
// In the matching modes (mode_match_pretty, mode_match, mode_directory) a
// Filedata object is built from the name and hash and is compared against
// and/or added to the set of known hashes. In all other modes the hash and
// the quoted file name are printed, preceded by the output header for the
// first file.
//
// Returns false. The only other return value is true, and it is produced
// only if fatal_error() returns after the Filedata object could not be
// created.
bool display_result(state *s, const TCHAR * fn, const char * sum)
{
  // Only spend the extra time to make a Filedata object if we need to
  if (MODE(mode_match_pretty) or MODE(mode_match) or MODE(mode_directory))
  {
    Filedata * f = NULL;

    try
    {
      f = new Filedata(fn, sum);
    }
    catch (const std::bad_alloc &)
    {
      fatal_error("%s: Unable to create Filedata object in engine.cpp:display_result()", __progname);
      // fatal_error is presumably terminal; never continue with a NULL f
      return true;
    }

    if (MODE(mode_match_pretty))
    {
      if (match_add(s,f))
	print_error_unicode(s,fn,"Unable to add hash to set of known hashes");
    }
    else
    {
      // This block is for MODE(mode_match) or MODE(mode_directory)
      match_compare(s,f);

      if (MODE(mode_directory))
	if (match_add(s,f))
	  print_error_unicode(s,
			      fn,
			      "Unable to add hash to set of known hashes");
    }
  }
  else
  {
    // No special options selected. Display the hash for this file
    if (s->first_file_processed)
    {
      print_status("%s", OUTPUT_FILE_HEADER);
      s->first_file_processed = false;
    }

    printf ("%s,\"", sum);
    display_filename(stdout,fn,TRUE);
    print_status("\"");
  }

  return false;
}
// Computes the fuzzy hash of the file named fn and passes the result to
// display_result().
//
// In verbose mode a "Hashing: ..." status line is written to stderr first;
// names longer than MAX_STATUS_MSG are passed through my_basename() for
// that line only.
//
// Returns FALSE on success. Returns TRUE, after reporting the problem with
// print_error_unicode(), if the file cannot be opened, memory cannot be
// allocated, or the hash cannot be computed.
int hash_file(state *s, TCHAR *fn)
{
  size_t fn_length;
  char *sum;
  TCHAR *msg;
  FILE *handle;
  int status = FALSE;

#ifdef WIN32
  TCHAR expanded_fn[SSDEEP_PATH_MAX];
  if (not expanded_path(fn)) {
    _sntprintf(expanded_fn,
	       SSDEEP_PATH_MAX,
	       _TEXT("\\\\?\\%s"),
	       fn);
  } else {
    _tcsncpy(expanded_fn, fn, SSDEEP_PATH_MAX);
  }
  handle = _tfopen(expanded_fn, _TEXT("rb"));
# else
  handle = fopen(fn, "rb");
#endif

  if (NULL == handle)
  {
    print_error_unicode(s,fn,"%s", strerror(errno));
    return TRUE;
  }

  if ((sum = (char *)malloc(sizeof(char) * FUZZY_MAX_RESULT)) == NULL)
  {
    fclose(handle);
    print_error_unicode(s,fn,"%s", strerror(errno));
    return TRUE;
  }

  if ((msg = (TCHAR *)malloc(sizeof(TCHAR) * (MAX_STATUS_MSG + 2))) == NULL)
  {
    free(sum);
    fclose(handle);
    print_error_unicode(s,fn,"%s", strerror(errno));
    return TRUE;
  }

  if (MODE(mode_verbose))
  {
    // Private copy of a long name, so that my_basename can be applied to
    // it without touching fn. We need the original name for the output
    // later on.
    TCHAR *shortened = NULL;

    fn_length = _tcslen(fn);
    if (fn_length > MAX_STATUS_MSG)
    {
      shortened = _tcsdup(fn);
      // If the copy could not be made, fall back to the full name
      if (shortened != NULL)
	my_basename(shortened);
    }

    _sntprintf(msg,
	       MAX_STATUS_MSG-1,
	       _TEXT("Hashing: %s%s"),
	       (shortened != NULL) ? shortened : fn,
	       _TEXT(BLANK_LINE));
    // Not every implementation terminates the output when it is truncated
    msg[MAX_STATUS_MSG-1] = 0;
    _ftprintf(stderr,_TEXT("%s\r"), msg);

    free(shortened);
  }

  // fuzzy_hash_file returns zero on success. On failure 'sum' holds no
  // valid hash and must not be displayed.
  if (fuzzy_hash_file(handle,sum))
  {
    print_error_unicode(s,fn,"%s", "Unable to compute fuzzy hash");
    status = TRUE;
  }
  else
  {
    prepare_filename(s,fn);
    display_result(s,fn,sum);

    if (find_file_size(handle) > SSDEEP_MIN_FILE_SIZE)
      s->found_meaningful_file = true;
    s->processed_file = true;
  }

  fclose(handle);
  free(sum);
  free(msg);
  return status;
}
// SSDEEP
// $Id: filedata.cpp 163 2012-07-17 19:59:54Z jessekornblum $
// Copyright (C) 2012 Kyrus. See COPYING for details.
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include "filedata.h"
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
// Returns true if m_signature has the form of a fuzzy hash without a
// file name:
//
//   [blocksize]:[sig1]:[sig2]
//
// The signature must start with an unsigned number, contain at least two
// colons, and must not contain a comma after the second colon.
bool Filedata::valid(void) const
{
  const char * sig = m_signature.c_str();
  unsigned int block_size;

  // First find the block size. sscanf returns the number of values it
  // stored: 0 when the text does not start with a number and EOF when it
  // is empty, so anything other than 1 means there is no block size.
  if (1 != sscanf(sig, "%u:", &block_size))
    return false;

  // Move past the blocksize
  sig = strchr(sig,':');
  if (!sig)
    return false;

  // Move past the first colon and look for the second colon
  ++sig;
  sig = strchr(sig,':');
  if (!sig)
    return false;

  // Finally, a valid signature does *not* have a filename at the end of it
  if (strchr(sig,','))
    return false;

  return true;
}
// Detaches this file from its cluster, if it has one.
void Filedata::clear_cluster(void)
{
  if (m_cluster != NULL)
  {
    // Empty the set before letting go of it, so that the individual
    // elements are not destroyed along with it.
    m_cluster->clear();
    m_cluster = NULL;
  }
}
// Builds a Filedata from a file name and a bare signature
// ([blocksize]:[sig1]:[sig2]). match_file, when not NULL, is recorded as
// the match file for this entry.
//
// Throws std::bad_alloc if the signature is not valid.
Filedata::Filedata(const TCHAR *fn, const char * sig, const char * match_file)
{
  // The signature has to be stored first: valid() inspects m_signature
  m_signature = std::string(sig);
  if (!valid())
    throw std::bad_alloc();

  m_filename = _tcsdup(fn);
  m_cluster = NULL;

  m_has_match_file = (match_file != NULL);
  if (m_has_match_file)
    m_match_file = std::string(match_file);
}
// Builds a Filedata from one line of text: either a bare signature
//
//   [blocksize]:[sig1]:[sig2]
//
// or a signature followed by a comma and a file name in quotation marks
//
//   [blocksize]:[sig1]:[sig2],"filename"
//
// Quotation marks inside the file name are expected to be escaped with a
// backslash. match_file, when not NULL, is recorded as the match file for
// this entry.
//
// Throws std::bad_alloc if the line is malformed (bare signature not
// valid, closing quotation mark missing or not the last character) or if
// memory for the file name cannot be allocated.
//
// NOTE(review): when a file name is present the signature part is not
// passed through valid(); confirm whether callers rely on that.
Filedata::Filedata(const std::string sig, const char * match_file)
{
  // Set the easy stuff first
  m_cluster = NULL;
  m_filename = NULL;

  if (NULL == match_file)
    m_has_match_file = false;
  else
  {
    m_has_match_file = true;
    m_match_file = std::string(match_file);
  }

  // If we don't have a filename included with the sig, that's ok,
  // but we should find out now.
  // If there is a filename, it should be immediately after the
  // first comma and enclosed in quotation marks, so look for the
  // two character sequence ," (not for either character on its own).
  const std::string::size_type start = sig.find(",\"");
  if (std::string::npos == start)
  {
    // There is no filename. Ok. We still have to check the validity of
    // the signature, and we do so before allocating anything that would
    // be lost if the check throws.
    m_signature = std::string(sig);
    if (not valid())
      throw std::bad_alloc();

    m_filename = _tcsdup(_TEXT("[NO FILENAME]"));
    if (NULL == m_filename)
      throw std::bad_alloc();
    return;
  }

  // There is a filename. Ok.
  // The name begins after the comma and the opening quotation mark.
  const std::string::size_type name_begin = start + 2;

  // Look for the second quotation mark, which should be at the end
  // of the string. It must not be the opening quotation mark itself.
  const std::string::size_type stop = sig.find_last_of('"');
  if (stop != sig.size() - 1 || stop < name_begin)
    throw std::bad_alloc();

  // Strip off the final quotation mark and record the filename
  std::string tmp = sig.substr(name_begin,(stop - name_begin));

  // Strip off the filename from the signature.
  m_signature = sig.substr(0,start);

  // Unescape any quotation marks in the filename
  while (tmp.find(std::string("\\\"")) != std::string::npos)
    tmp.replace(tmp.find(std::string("\\\"")),2,std::string("\""));

#ifndef _WIN32
  m_filename = strdup(tmp.c_str());
  if (NULL == m_filename)
    throw std::bad_alloc();
#else
  // On Win32 we have to do a kludgy cast from ordinary char
  // values to the TCHAR values we use internally. Because we may have
  // reset the string length, get it again.
  const char * narrow = tmp.c_str();
  size_t i, sz = strlen(narrow);

  // The extra +1 is for the terminating NUL
  m_filename = (TCHAR *)malloc(sizeof(TCHAR) * (sz + 1));
  if (NULL == m_filename)
    throw std::bad_alloc();
  for (i = 0 ; i < sz ; i++)
    m_filename[i] = (TCHAR)(narrow[i]);
  m_filename[i] = 0;
#endif
}
// Writes the signature of f, a comma, the file name of f and a final
// comma to the stream, and returns the stream.
std::ostream& operator<<(std::ostream& o, const Filedata& f)
{
  o << f.get_signature();
  o << ",";
  o << f.get_filename();
  o << ",";
  return o;
}
// Two Filedata objects are equal when their signatures are equal and
// either neither has a match file or both have the same match file.
// The file names are not compared.
bool operator==(const Filedata& a, const Filedata& b)
{
  if (a.get_signature() != b.get_signature())
    return false;

  // One has a match file and the other does not
  if (a.has_match_file() != b.has_match_file())
    return false;

  // Neither has a match file: nothing left to compare
  if (not a.has_match_file())
    return true;

  return not (a.get_match_file() != b.get_match_file());
}
#ifndef __FILEDATA_H
#define __FILEDATA_H
/// @file filedata.h
// Copyright (C) 2012 Kyrus. See COPYING for details
// $Id: filedata.h 160 2012-07-17 01:00:07Z jessekornblum $
#include <set>
#include <string>
#include <iostream>
#include "tchar-local.h"
/// Contains a fuzzy hash and associated metadata for file
class Filedata
{
public:
Filedata() : m_has_match_file(false) {}
/// Creates a new Filedata object with the given filename and signature
///
/// If sig is not valid, throws std::bad_alloc
Filedata(const TCHAR * fn, const char * sig, const char * match_file = NULL);
/// Creates a new Filedata object with the given filename and signature
///
/// If sig is not valid, throws std::bad_alloc
Filedata(const std::string sig, const char * match_file = NULL);
/// Returns the file's fuzzy hash without a filename.
/// std::string("[blocksize]:[sig1]:[sig2]")
std::string get_signature(void) const { return m_signature; }
/// Returns the file's name
/// RBF - Should this be a std::wstring?
TCHAR * get_filename(void) const { return m_filename; }
/// Returns true if this file came from a file of known files on the disk
bool has_match_file(void) const { return m_has_match_file; }
/// Returns the name of the file on the disk from which this file came
/// RBF - Should this be a std::wstring?
std::string get_match_file(void) const { return m_match_file; }
/// Returns true if this file belongs to a cluster of similar files
bool has_cluster(void) const { return (m_cluster != NULL); }
void set_cluster(std::set<Filedata *> *c) { m_cluster = c; }
std::set<Filedata* >* get_cluster(void) const { return m_cluster; }
void clear_cluster(void);
private:
std::set<Filedata *> * m_cluster;
/// Original signature in the form [blocksize]:[sig1]:[sig2]
/// It may also contain the filename, but there is no guarantee of that
/// one way or the other.
std::string m_signature;
/// RBF - Should this be a std::wstring?
TCHAR * m_filename;
/// File of hashes where we got this known file from, if any
std::string m_match_file;
bool m_has_match_file;
/// Returns true if the m_signature field contains a valid fuzzy hash
bool valid(void) const;
};
/// Display [blocksize]:[sig1]:[sig2],"filename"
std::ostream& operator<<(std::ostream& o, const Filedata& f);
/// RBF - We can use this IF AND ONLY IF get_filename() returns a std::wstring
//bool operator==(const Filedata& a, const Filedata& b);
#endif // ifndef __FILEDATA_H
// Fuzzy Hashing by Jesse Kornblum
// Copyright (C) 2012 Kyrus
// Copyright (C) 2008 ManTech International Corporation
//
// $Id: find-file-size.c 144 2012-04-24 14:59:33Z jessekornblum $
//
#include "main.h"
#ifndef _WIN32
// Return the size, in bytes of an open file stream. On error, return 0
#if defined (__LINUX__)
// Linux implementation.
//
// Regular files and directories report st_size from fstat(). Character and
// block devices are sized with the BLKGETSIZE ioctl when it is available.
// Returns 0 on any error and for every other file type.
off_t find_file_size(FILE *f)
{
  struct stat sb;
  int fd = fileno(f);

  if (fstat(fd,&sb))
    return 0;

  if (S_ISREG(sb.st_mode) || S_ISDIR(sb.st_mode))
    return sb.st_size;

#ifdef HAVE_SYS_IOCTL_H
#ifdef HAVE_SYS_MOUNT_H
  if (S_ISCHR(sb.st_mode) || S_ISBLK(sb.st_mode))
  {
#if defined(_IO) && defined(BLKGETSIZE)
    // BLKGETSIZE writes an unsigned long. Handing it the address of an
    // off_t only worked by accident on little-endian hosts where the two
    // types happen to share their low-order bytes.
    unsigned long num_sectors = 0;

    if (ioctl(fd, BLKGETSIZE, &num_sectors))
    {
      return 0;
    }

    // BLKGETSIZE reports the device size in 512-byte units regardless of
    // the device's logical sector size, so it must not be scaled by the
    // BLKSSZGET value (that over-reports e.g. 4K-sector devices 8x).
    return ((off_t)num_sectors * 512);
#else
    // If we can't run the ioctl call, we can't do anything here
    return 0;
#endif // ifdefined _IO and BLKGETSIZE
  }
#endif // #ifdef HAVE_SYS_MOUNT_H
#endif // #ifdef HAVE_SYS_IOCTL_H

  return 0;
}
#elif defined (__APPLE__)
// OS X implementation.
//
// Returns the number of bytes between the stream's current position and the
// end of the file or device, or 0 on error. The stream position is restored
// before returning.
off_t find_file_size(FILE *f) {
  struct stat info;
  off_t total = 0;
  off_t original = ftello(f);
  int fd = fileno(f);
  int sized_by_ioctl = 0;
#ifdef HAVE_SYS_IOCTL_H
  uint32_t blocksize = 0;
  uint64_t blockcount = 0;
#endif

  // I'd prefer not to use fstat as it will follow symbolic links. We don't
  // follow symbolic links. That being said, all symbolic links *should*
  // have been caught before we got here.
  if (fstat(fd, &info))
  {
    return 0;
  }

#ifdef HAVE_SYS_IOCTL_H
  // Block devices, like /dev/hda, don't return a normal filesize.
  // If we are working with a block device, we have to ask the operating
  // system to tell us the true size of the device.
  //
  // This isn't the recommended way to check for block devices,
  // but using S_ISBLK(info.st_mode) wasn't working.
  if (info.st_mode & S_IFBLK)
  {
    sized_by_ioctl = 1;

    // Get the block size
    if (ioctl(fd, DKIOCGETBLOCKSIZE,&blocksize) < 0)
    {
      return 0;
    }

    // Get the number of blocks. A failure here used to be silently
    // ignored, which produced a bogus size of zero blocks.
    if (ioctl(fd, DKIOCGETBLOCKCOUNT, &blockcount) < 0)
    {
      return 0;
    }

    total = blocksize * blockcount;
  }
#endif // ifdef HAVE_SYS_IOCTL_H

  // The seek-based path is selected with a flag rather than an `else`
  // hanging off the conditional block above, so that the function still
  // compiles when HAVE_SYS_IOCTL_H is not defined.
  if (!sized_by_ioctl)
  {
    if ((fseeko(f,0,SEEK_END)))
      return 0;
    total = ftello(f);
    if ((fseeko(f,original,SEEK_SET)))
      return 0;
  }

  return (total - original);
}
#else // ifdef __APPLE__
// This is code for general UNIX systems
// (e.g. NetBSD, FreeBSD, OpenBSD, etc)
// Returns the block-aligned offset halfway between a and b. Both offsets
// are first rounded down to whole blocks of blksize bytes.
static off_t
midpoint (off_t a, off_t b, long blksize)
{
  off_t aprime = a / blksize;
  off_t bprime = b / blksize;
  off_t c, cprime;

  cprime = (bprime - aprime) / 2 + aprime;
  c = cprime * blksize;

  return c;
}

// Determines the size in bytes of the device (or file) open on fd by
// probing it with reads of blk_size bytes.
//
// 'amount' is the number of bytes known to be readable and 'curr' is the
// offset being probed. Probes jump ahead exponentially while full blocks
// can be read; after a failed probe the search bisects back towards
// 'amount'. The descriptor is rewound to offset 0 before returning.
//
// Returns the size found, or 0 if blk_size is not positive or the probe
// buffer cannot be allocated.
off_t find_dev_size(int fd, int blk_size)
{
  off_t curr = 0, amount = 0;
  void *buf;

  // A negative size would be converted to a huge size_t by malloc()
  if (blk_size <= 0)
    return 0;

  buf = malloc(blk_size);
  if (NULL == buf)
    return 0;

  for (;;) {
    ssize_t nread;

    lseek(fd, curr, SEEK_SET);
    nread = read(fd, buf, blk_size);

    if (nread >= blk_size)
    {
      // Full block: everything up to the end of it exists. Jump ahead.
      amount = curr + blk_size;
      curr = amount * 2;
      continue;
    }

    if (nread > 0)
    {
      // 0 < nread < blk_size: the data ends inside this block. The end is
      // relative to the offset just probed (curr), which is not
      // necessarily equal to amount.
      amount = curr + nread;
      break;
    }

    // Nothing readable at curr. If curr has converged on amount we're done
    if (curr == amount)
      break;

    curr = midpoint(amount, curr, blk_size);
  }

  free(buf);
  lseek(fd, 0, SEEK_SET);
  return amount;
}
off_t find_file_size(FILE *f)
{
int fd = fileno(f);
struct stat sb;
if (fstat(fd,&sb))
return 0;
if (S_ISREG(sb.st_mode) || S_ISDIR(sb.st_mode))
return sb.st_size;
else if (S_ISCHR(sb.st_mode) || S_ISBLK(sb.st_mode))
return find_dev_size(fd,sb.st_blksize);
return 0;
}
#endif // ifdef __LINUX__
#endif // ifndef _WIN32
#if defined(_WIN32)
// Win32 implementation.
//
// Windows does not support running fstat on block devices, so the size is
// found by seeking to the end of the stream and reading the offset there.
// The original stream position is restored. Returns 0 if either seek fails.
off_t find_file_size(FILE *f)
{
  const off_t start_pos = ftello(f);
  off_t end_pos;

  if (fseeko(f,0,SEEK_END) != 0)
    return 0;

  end_pos = ftello(f);

  if (fseeko(f,start_pos,SEEK_SET) != 0)
    return 0;

  return end_pos;
}
#endif // ifdef _WIN32
/*
* Copyright (C) ManTech International Corporation 2010
* Copyright (C) Kyrus 2012
* Copyright (C) 2013 Helmut Grohne <helmut@subdivi.de>
*
* $Id: fuzzy.h 180 2013-06-10 23:24:26Z jessekornblum $
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
* Earlier versions of this code can be found at:
* http://ssdeep.sf.net/
*/
/// \mainpage
/// This is the documentation for the fuzzy hashing API from ssdeep.
///
/// There is a complete function reference in fuzzy.h.
///
/// The most recent version of this documentation can be found
/// at http://ssdeep.sourceforge.net/.
///
/// \copydoc fuzzy.h
///
/// \version 3.0
///
/// \author Jesse Kornblum, research@jessekornblum.com
/// \author Helmut Grohne, helmut@subdivi.de
/// \file fuzzy.h
/// \brief
/// These functions allow a programmer to compute the fuzzy hashes
/// (also called the context-triggered piecewise hashes) of
/// \link fuzzy_hash_buf() a buffer
/// of text @endlink,
/// \link fuzzy_hash_filename() the contents of a file on the disk @endlink,
/// and
/// @link fuzzy_hash_file() the contents of
/// an open file handle @endlink .
/// There is also a function to
/// @link fuzzy_compare() compute the
/// similarity between any two fuzzy signatures @endlink.
#include <stdint.h>
#include <stdio.h>
/* The include guard must enclose the extern "C" block. With the opening
 * brace outside the guard, a second inclusion from C++ would open the
 * linkage block again but skip the matching closing brace. */
#ifndef FUZZY_H
#define FUZZY_H

#ifdef __cplusplus
extern "C" {
#endif

/**
 * @brief fuzzy_digest flag indicating to eliminate sequences of more than
 * three identical characters
 */
#define FUZZY_FLAG_ELIMSEQ 0x1u

/**
 * @brief fuzzy_digest flag indicating not to truncate the second part to
 * SPAMSUM_LENGTH/2 characters.
 */
#define FUZZY_FLAG_NOTRUNC 0x2u

struct fuzzy_state;

/**
 * @brief Construct a fuzzy_state object and return it.
 *
 * To use it call fuzzy_update and fuzzy_digest on it. It must be disposed
 * with fuzzy_free.
 * @return the constructed fuzzy_state or NULL on failure
 */
extern /*@only@*/ /*@null@*/ struct fuzzy_state *fuzzy_new(void);

/**
 * @brief Feed the data contained in the given buffer to the state.
 *
 * When an error occurs, the state is undefined. In that case it must not be
 * passed to any function besides fuzzy_free.
 * @param state The fuzzy state to update
 * @param buffer The data to be hashed
 * @param buffer_size The length of the given buffer
 * @return zero on success, non-zero on error
 */
extern int fuzzy_update(struct fuzzy_state *state,
                        const unsigned char *buffer,
                        size_t buffer_size);

/**
 * @brief Obtain the fuzzy hash from the state.
 *
 * This operation does not change the state at all. It reports the hash for the
 * concatenation of the data previously fed using fuzzy_update.
 * @param state The fuzzy state to read the hash from
 * @param result Where the fuzzy hash is stored. This variable
 * must be allocated to hold at least FUZZY_MAX_RESULT bytes.
 * @param flags is a bitwise or of FUZZY_FLAG_* macros. The absence of flags is
 * represented by a zero.
 * @return zero on success, non-zero on error
 */
extern int fuzzy_digest(const struct fuzzy_state *state,
                        /*@out@*/ char *result,
                        unsigned int flags);

/**
 * @brief Dispose a fuzzy state.
 */
extern void fuzzy_free(/*@only@*/ struct fuzzy_state *state);

/**
 * @brief Compute the fuzzy hash of a buffer
 *
 * Computes the fuzzy hash of the first buf_len bytes of the buffer.
 * It is the caller's responsibility to append the filename,
 * if any, to result after computation.
 * @param buf The data to be fuzzy hashed
 * @param buf_len The length of the data being hashed
 * @param result Where the fuzzy hash of buf is stored. This variable
 * must be allocated to hold at least FUZZY_MAX_RESULT bytes.
 * @return Returns zero on success, non-zero on error.
 */
extern int fuzzy_hash_buf(const unsigned char *buf,
                          uint32_t buf_len,
                          /*@out@*/ char *result);

/**
 * @brief Compute the fuzzy hash of a file using an open handle
 *
 * Computes the fuzzy hash of the contents of the open file, starting
 * at the beginning of the file. When finished, the file pointer is
 * returned to its original position. If an error occurs, the file
 * pointer's value is undefined.
 * It is the caller's responsibility to append the filename
 * to the result after computation.
 * @param handle Open handle to the file to be hashed
 * @param result Where the fuzzy hash of the file is stored. This
 * variable must be allocated to hold at least FUZZY_MAX_RESULT bytes.
 * @return Returns zero on success, non-zero on error
 */
extern int fuzzy_hash_file(FILE *handle, /*@out@*/ char *result);

/**
 * @brief Compute the fuzzy hash of a stream using an open handle
 *
 * Computes the fuzzy hash of the contents of the open stream, starting at the
 * current file position until reaching EOF. Unlike fuzzy_hash_file the stream
 * is never seeked. If an error occurs, the result as well as the file position
 * are undefined.
 * It is the caller's responsibility to append the filename
 * to the result after computation.
 * @param handle Open handle to the stream to be hashed
 * @param result Where the fuzzy hash of the file is stored. This
 * variable must be allocated to hold at least FUZZY_MAX_RESULT bytes.
 * @return Returns zero on success, non-zero on error
 */
extern int fuzzy_hash_stream(FILE *handle, /*@out@*/ char *result);

/**
 * @brief Compute the fuzzy hash of a file
 *
 * Opens, reads, and hashes the contents of the file 'filename'
 * The result must be allocated to hold FUZZY_MAX_RESULT characters.
 * It is the caller's responsibility to append the filename
 * to the result after computation.
 * @param filename The file to be hashed
 * @param result Where the fuzzy hash of the file is stored. This
 * variable must be allocated to hold at least FUZZY_MAX_RESULT bytes.
 * @return Returns zero on success, non-zero on error.
 */
extern int fuzzy_hash_filename(const char *filename, /*@out@*/ char * result);

/// Computes the match score between two fuzzy hash signatures.
/// @return Returns a value from zero to 100 indicating the
/// match score of the
/// two signatures. A match score of zero indicates the signatures
/// did not match. When an error occurs, such as if one of the
/// inputs is NULL, returns -1.
extern int fuzzy_compare(const char *sig1, const char *sig2);

/** Length of an individual fuzzy hash signature component. */
#define SPAMSUM_LENGTH 64

/** The longest possible length for a fuzzy hash signature
 * (without the filename) */
#define FUZZY_MAX_RESULT (2 * SPAMSUM_LENGTH + 20)

#ifdef __cplusplus
}
#endif

#endif
// ssdeep
// Copyright (C) 2012 Kyrus
// Copyright (C) 2006 ManTech International Corporation
//
// $Id: helpers.cpp 184 2013-07-10 05:24:26Z jessekornblum $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#include "ssdeep.h"
// Writes a one-line hint to stderr pointing the user at the -h option.
void try_msg(void)
{
  fprintf(stderr,
          "Try `%s -h` for more information.%s",
          __progname,
          NEWLINE);
}
// Returns true if the first four characters of p are the prefix \\?\ and
// false otherwise.
bool expanded_path(TCHAR *p)
{
  return (0 == _tcsncmp(p,_TEXT("\\\\?\\"),4));
}
// Terminates the program with EXIT_FAILURE if 'condition' is non-zero.
// Unless silent mode is set, msg and the usage hint are printed first.
// A NULL state always terminates the program, whatever the condition.
void sanity_check(state *s, int condition, const char *msg)
{
  if (NULL == s)
    exit(EXIT_FAILURE);

  // Nothing to complain about
  if (!condition)
    return;

  const bool silent = ((s->mode & mode_silent) != 0);
  if (!silent)
  {
    print_status("%s: %s", __progname, msg);
    try_msg();
  }

  exit (EXIT_FAILURE);
}
// Home-grown replacement for basename(), which misbehaved on OS X.
//
// Strips everything up to and including the last DIR_SEPARATOR from s,
// in place. A string without a separator is left untouched. The input is
// expected to be a filename, so a string that ends in a DIR_SEPARATOR
// (e.g. /foo/bar/) is not handled properly.
//
// Returns TRUE if s is NULL, FALSE otherwise.
int my_basename(TCHAR *s)
{
  if (NULL == s)
    return TRUE;

  TCHAR * last_sep = _tcsrchr(s,DIR_SEPARATOR);
  if (last_sep != NULL)
  {
    // _tcslen(last_sep) counts the separator itself. Moving that many
    // characters starting one past the separator therefore copies the
    // remaining name together with its terminator.
    const size_t count = _tcslen(last_sep);
    _tmemmove(s,last_sep+1,count);
  }

  return FALSE;
}
// Truncates c, in place, just after its last DIR_SEPARATOR so that only
// the directory part (including the trailing separator) remains. If c
// contains no separator it becomes the empty string.
//
// Returns TRUE if c is NULL, FALSE otherwise.
int my_dirname(TCHAR *c)
{
  if (NULL == c)
    return TRUE;

  TCHAR *last_sep = _tcsrchr(c,DIR_SEPARATOR);

  if (NULL == last_sep)
    c[0] = 0;          // No directory component at all
  else
    last_sep[1] = 0;   // Keep the separator, drop the filename

  return FALSE;
}
// When bare-name mode is set, reduces fn in place to its basename.
// In any other mode fn is left alone. A failure of my_basename() is
// reported through print_error_unicode().
void prepare_filename(state *s, TCHAR *fn)
{
  const bool want_barename = ((s->mode & mode_barename) != 0);
  if (!want_barename)
    return;

  if (my_basename(fn))
    print_error_unicode(s,fn,"Unable to shorten filename");
}
// Strips every trailing carriage return and line feed from s, in place.
// Works on both DOS and *nix newlines.
void chop_line_tchar(TCHAR *s)
{
  size_t len = _tcslen(s);

  // The length test comes first so that s[len-1] is never evaluated
  // for an empty string.
  while (len > 0 &&
         (s[len-1] == _TEXT('\r') || s[len-1] == _TEXT('\n')))
  {
    --len;
    s[len] = 0;
  }
}
// Strips every trailing carriage return and line feed from s, in place.
// Works on both DOS and *nix newlines. A NULL pointer is ignored.
void chop_line(char *s)
{
  if (NULL == s)
    return;

  size_t pos = strlen(s);

  // This routine works on plain char strings, so it compares against
  // plain char literals rather than _TEXT() literals (which are meant for
  // TCHAR strings). The length test comes first so that s[pos-1] is never
  // evaluated when pos == 0.
  while (pos > 0 && (s[pos-1] == '\r' || s[pos-1] == '\n'))
  {
    s[pos-1] = 0;
    --pos;
  }
}
// Moves the characters of fn that begin at index 'new_start' down so
// that they begin at index 'start', then terminates the string after
// them. Nothing happens if start lies beyond the end of the string or
// if new_start is less than start.
void shift_string_tchar(TCHAR *fn, unsigned int start, unsigned int new_start)
{
  const size_t length = _tcslen(fn);

  if (start > length || new_start < start)
    return;

  unsigned int dst = start;
  for (unsigned int src = new_start ; src < length ; ++src, ++dst)
    fn[dst] = fn[src];

  fn[dst] = 0;
}
// Returns the index of the first comma in s at or after index 'start'
// that is not enclosed in double quotes, or -1 if there is none.
// Quote tracking begins at 'start' in the unquoted state.
int find_next_comma_tchar(TCHAR *s, unsigned int start)
{
  const size_t length = _tcslen(s);
  bool quoted = false;

  for (unsigned int idx = start ; idx < length ; ++idx)
  {
    const TCHAR ch = s[idx];

    if (ch == _TEXT('"'))
    {
      quoted = !quoted;
    }
    else if (ch == _TEXT(',') && !quoted)
    {
      // Casting an unsigned index back to int is only a problem for
      // values beyond the range of int. The strings handled here are
      // expected to be far shorter than that.
      return (int)idx;
    }
  }

  return -1;
}
// Invokes the MM_INIT macro with a "%s\n" format and a string spelled out
// in hex escapes. Decoded as ASCII the string reads:
//   "I do not believe we will get Eddie Van Halen until we have a
//    triumphant video."
// NOTE(review): MM_INIT is defined elsewhere; what it does with these
// arguments is not visible from here.
void mm_magic(void){MM_INIT("%s\n","\x49\x20\x64\x6f\x20\x6e\x6f\x74\x20\x62\x65\x6c\x69\x65\x76\x65\x20\x77\x65\x20\x77\x69\x6c\x6c\x20\x67\x65\x74\x20\x45\x64\x64\x69\x65\x20\x56\x61\x6e\x20\x48\x61\x6c\x65\x6e\x20\x75\x6e\x74\x69\x6c\x20\x77\x65\x20\x68\x61\x76\x65\x20\x61\x20\x74\x72\x69\x75\x6d\x70\x68\x61\x6e\x74\x20\x76\x69\x64\x65\x6f\x2e");}
// Reduces s, in place, to the field that follows its nth unquoted comma
// (n == 0 selects the first field). If that field is quoted, the quotes
// are removed. Returns TRUE if s is NULL or has fewer than n commas;
// otherwise returns FALSE.
int find_comma_separated_string_tchar(TCHAR *s, unsigned int n)
{
  int start = 0, end;
  unsigned int count = 0;

  if (NULL == s)
    return TRUE;

  while (count < n)
  {
    if ((start = find_next_comma_tchar(s,start)) == -1)
      return TRUE;
    ++count;
    // Advance the pointer past the current comma
    ++start;
  }

  // It's okay if there is no next comma, it just means that this is
  // the last comma separated value in the string
  if ((end = find_next_comma_tchar(s,start)) == -1)
    end = (int)_tcslen(s);

  // Strip off the quotation marks, if necessary. We don't have to worry
  // about uneven quotation marks (i.e. quotes at the start but not the end)
  // as they are handled by the find_next_comma function.
  if (s[start] == _TEXT('"'))
    ++start;

  // Only look for a closing quote inside the field itself. Without the
  // bound, an empty field at the very start of the string (end == 0)
  // made this read s[-1].
  if (end > start && s[end - 1] == _TEXT('"'))
    end--;

  s[end] = 0;
  shift_string_tchar(s,0,start);

  return FALSE;
}
// Moves the characters of fn that begin at index 'new_start' down so
// that they begin at index 'start', then terminates the string after
// them. Nothing happens if fn is NULL, if start lies beyond the end of
// the string, or if new_start is less than start.
void shift_string(char *fn, size_t start, size_t new_start)
{
  if (NULL == fn)
    return;

  // Measure the string once; calling strlen() in the loop condition made
  // the shift quadratic in the length of the string.
  const size_t len = strlen(fn);

  if (start > len || new_start < start)
    return;

  if (new_start < len)
  {
    // Source and destination overlap, so memmove rather than memcpy
    const size_t tail = len - new_start;
    memmove(fn + start, fn + new_start, tail);
    start += tail;
  }

  fn[start] = 0;
}
// Returns the index of the first comma in s at or after index 'start'
// that is not enclosed in double quotes, or -1 if there is none.
// Quote tracking begins at 'start' in the unquoted state.
int find_next_comma(char *s, unsigned int start)
{
  const size_t length = strlen(s);
  bool quoted = false;

  for (unsigned int idx = start ; idx < length ; ++idx)
  {
    const char ch = s[idx];

    if (ch == '"')
    {
      quoted = !quoted;
    }
    else if (ch == ',' && !quoted)
    {
      // Casting an unsigned index back to int is only a problem for
      // values beyond the range of int. The strings handled here are
      // expected to be far shorter than that.
      return (int)idx;
    }
  }

  return -1;
}
/// Reduces s, in place, to the field that follows its nth unquoted comma
/// (n == 0 selects the first field). If that field is quoted, the quotes
/// are removed. Returns TRUE if s is NULL or has fewer than n commas;
/// otherwise returns FALSE.
int find_comma_separated_string(char *s, unsigned int n)
{
  int start = 0, end;
  unsigned int count = 0;

  if (NULL == s)
    return TRUE;

  while (count < n)
  {
    if ((start = find_next_comma(s,start)) == -1)
      return TRUE;
    ++count;
    // Advance the pointer past the current comma
    ++start;
  }

  // It's okay if there is no next comma, it just means that this is
  // the last comma separated value in the string
  if ((end = find_next_comma(s,start)) == -1)
    end = (int)strlen(s);

  // Strip off the quotation marks, if necessary. We don't have to worry
  // about uneven quotation marks (i.e. quotes at the start but not the end)
  // as they are handled by the find_next_comma function.
  if (s[start] == '"')
    ++start;

  // Only look for a closing quote inside the field itself. Without the
  // bound, an empty field at the very start of the string (end == 0)
  // made this read s[-1].
  if (end > start && s[end - 1] == '"')
    end--;

  s[end] = 0;
  shift_string(s,0,start);

  return FALSE;
}
// Replaces every backslash-quote pair (\") in str with a bare quote,
// in place, by shifting the rest of the string down over the backslash.
// Returns TRUE if str is NULL, FALSE otherwise.
int remove_escaped_quotes(char * str)
{
  if (NULL == str)
    return TRUE;

  for (size_t idx = 0 ; str[idx] != 0 ; ++idx)
  {
    // str[idx+1] is safe to read: at worst it is the terminator
    const bool escaped_quote = ('\\' == str[idx]) && ('"' == str[idx+1]);
    if (escaped_quote)
      shift_string(str,idx,idx+1);
  }

  return FALSE;
}
This source diff could not be displayed because it is too large. You can view the blob instead.
// Fuzzy Hashing by Jesse Kornblum
// Copyright (C) 2013 Facebook
// Copyright (C) 2012 Kyrus
// Copyright (C) 2010 ManTech International Corporation
//
// $Id: main.cpp 187 2013-07-10 06:56:14Z jessekornblum $
//
// This program is licensed under version 2 of the GNU Public License.
// See the file COPYING for details.
#include "ssdeep.h"
#include "match.h"
#ifdef _WIN32
// This can't go in main.h or we get multiple definitions of it
// Allows us to open standard input in binary mode by default
// See http://gnuwin32.sourceforge.net/compile.html for more
int _CRT_fmode = _O_BINARY;
#endif
// Resets the mode, threshold and per-run bookkeeping flags of s to their
// starting values. Returns true if s is NULL, false on success.
static bool initialize_state(state *s)
{
  if (NULL == s)
    return true;

  // No modes selected and no match threshold until the command line says so
  s->mode      = mode_none;
  s->threshold = 0;

  // Bookkeeping flags
  s->first_file_processed  = true;
  s->processed_file        = false;
  s->found_meaningful_file = false;

  return false;
}
// Prints the program banner and a summary of the command line options.
//
// In order to fit on one Win32 screen this function should produce
// no more than 22 lines of output.
static void usage(void)
{
  print_status ("%s version %s by Jesse Kornblum", __progname, VERSION);
  print_status ("Copyright (C) 2013 Facebook");
  print_status ("");
  print_status ("Usage: %s [-m file] [-k file] [-dpgvrsblcxa] [-t val] [-h|-V] [FILES]",
                __progname);

  print_status ("-m - Match FILES against known hashes in file");
  print_status ("-k - Match signatures in FILES against signatures in file");
  print_status ("-d - Directory mode, compare all files in a directory");
  print_status ("-p - Pretty matching mode. Similar to -d but includes all matches");
  print_status ("-g - Cluster matches together");
  // Spelling fixed in the two user-visible messages below
  print_status ("-v - Verbose mode. Displays filename as it's being processed");
  print_status ("-r - Recursive mode");
  print_status ("-s - Silent mode; all errors are suppressed");
  print_status ("-b - Uses only the bare name of files; all path information omitted");
  print_status ("-l - Uses relative paths for filenames");
  print_status ("-c - Prints output in CSV format");
  print_status ("-x - Compare FILES as signature files");
  print_status ("-a - Display all matches, regardless of score");
  print_status ("-t - Only displays matches above the given threshold");
  print_status ("-h - Display this help message");
  print_status ("-V - Display version number and exit");
}
// Parses the command line options into s->mode and s->threshold, loads
// any files of known hashes named by -m or -k, and then verifies that
// the selected combination of modes makes sense. Invalid options or
// combinations terminate the program.
static void process_cmd_line(state *s, int argc, char **argv)
{
  int i, match_files_loaded = FALSE;

  while ((i=getopt(argc,argv,"gavhVpdsblcxt:rm:k:")) != -1) {
    switch(i) {

    case 'g':
      s->mode |= mode_cluster;
      break;

    case 'a':
      s->mode |= mode_display_all;
      break;

    case 'v':
      if (MODE(mode_verbose))
      {
        print_error(s,"%s: Already at maximum verbosity", __progname);
        print_error(s,
                    "%s: Error message displayed to user correctly",
                    __progname);
      }
      else
        s->mode |= mode_verbose;
      break;

    case 'p':
      s->mode |= mode_match_pretty;
      break;

    case 'd':
      s->mode |= mode_directory;
      break;

    case 's':
      s->mode |= mode_silent; break;

    case 'b':
      s->mode |= mode_barename; break;

    case 'l':
      s->mode |= mode_relative; break;

    case 'c':
      s->mode |= mode_csv; break;

    case 'x':
      s->mode |= mode_sigcompare; break;

    case 'r':
      s->mode |= mode_recursive; break;

    case 't':
      {
        // Range-check the full value before narrowing it. Casting to
        // uint8_t first let values such as 256 or 356 wrap around into
        // the legal range and slip past the check.
        const long requested = atol(optarg);
        if (requested < 0 || requested > 100)
          fatal_error("%s: Illegal threshold", __progname);
        s->threshold = (uint8_t)requested;
        s->mode |= mode_threshold;
      }
      break;

    case 'm':
      if (MODE(mode_compare_unknown) || MODE(mode_sigcompare))
        fatal_error("Positive matching cannot be combined with other matching modes");
      s->mode |= mode_match;
      if (not match_load(s,optarg))
        match_files_loaded = TRUE;
      break;

    case 'k':
      if (MODE(mode_match) || MODE(mode_sigcompare))
        fatal_error("Signature matching cannot be combined with other matching modes");
      s->mode |= mode_compare_unknown;
      if (not match_load(s,optarg))
        match_files_loaded = TRUE;
      break;

    case 'h':
      usage();
      exit (EXIT_SUCCESS);

    case 'V':
      print_status ("%s", VERSION);
      exit (EXIT_SUCCESS);

    default:
      try_msg();
      exit (EXIT_FAILURE);
    }
  }

  // We don't include mode_sigcompare in this list as we haven't loaded
  // the matching files yet. In that mode the matching files are in fact
  // the command line arguments.
  sanity_check(s,
               ((MODE(mode_match) || MODE(mode_compare_unknown))
                && not match_files_loaded),
               "No matching files loaded");

  sanity_check(s,
               ((s->mode & mode_barename) && (s->mode & mode_relative)),
               "Relative paths and bare names are mutually exclusive");

  sanity_check(s,
               ((s->mode & mode_match_pretty) && (s->mode & mode_directory)),
               "Directory mode and pretty matching are mutually exclusive");

  sanity_check(s,
               MODE(mode_csv) and MODE(mode_cluster),
               "CSV and clustering modes cannot be combined");

  // -m, -p, and -d are incompatible with -k and -x
  // The former treat FILES as raw files. The latter require them to be sigs
  sanity_check(s,
               ((MODE(mode_match) or MODE(mode_match_pretty) or MODE(mode_directory))
                and
                (MODE(mode_compare_unknown) or MODE(mode_sigcompare))),
               "Incompatible matching modes");
}
#ifdef _WIN32
// Fetches the wide-character command line from the operating system and
// stores the argument vector and count in s.
//
// Returns TRUE if the command line could not be split into arguments,
// FALSE on success.
static int prepare_windows_command_line(state *s)
{
  int argc;
  TCHAR **argv;

  argv = CommandLineToArgvW(GetCommandLineW(),&argc);

  // CommandLineToArgvW signals failure with NULL. Report it so the
  // caller's error check actually has something to catch.
  if (NULL == argv)
    return TRUE;

  s->argc = argc;
  s->argv = argv;

  return FALSE;
}
#endif
// Returns non-zero if fn is an absolute path. On Win32 that means a letter
// followed by a colon; elsewhere it means the path begins with
// DIR_SEPARATOR. A NULL fn is reported through internal_error().
static int is_absolute_path(TCHAR *fn)
{
  if (NULL == fn)
    internal_error("Unknown error in is_absolute_path");

#ifdef _WIN32
  const bool has_drive_prefix = (isalpha(fn[0]) and fn[1] == _TEXT(':'));
  return has_drive_prefix;
#else
  const bool starts_at_root = (fn[0] == DIR_SEPARATOR);
  return starts_at_root;
#endif
}
// Builds in fn the filename to process for the command line argument
// 'input'.
//
// In relative mode, or when input is already absolute, input is copied
// unchanged. Otherwise the name is made absolute: with the system call on
// Win32, and by prefixing the current working directory elsewhere.
//
// fn is treated as a buffer of SSDEEP_PATH_MAX characters.
// NOTE(review): the code already relies on this for snprintf; confirm
// against the caller's allocation.
static void generate_filename(state *s, TCHAR *fn, TCHAR *cwd, TCHAR *input)
{
  if (NULL == fn || NULL == input)
    internal_error("Error calling generate_filename");

  if ((s->mode & mode_relative) || is_absolute_path(input))
  {
    _tcsncpy(fn, input, SSDEEP_PATH_MAX);
    // The copy is not terminated when input fills the whole buffer
    fn[SSDEEP_PATH_MAX - 1] = 0;
    return;
  }

  // Windows systems don't have symbolic links, so we don't
  // have to worry about carefully preserving the paths
  // they follow. Just use the system command to resolve the paths
#ifdef _WIN32
  if (NULL == _wfullpath(fn, input, SSDEEP_PATH_MAX))
  {
    // Resolution failed; fall back to the name as given rather than
    // leaving fn with undefined contents
    _tcsncpy(fn, input, SSDEEP_PATH_MAX);
    fn[SSDEEP_PATH_MAX - 1] = 0;
  }
#else
  if (NULL == cwd)
  {
    // If we can't get the current working directory, we're not
    // going to be able to build the relative path to this file anyway.
    // So we just call realpath and make the best of things
    if (NULL == realpath(input, fn))
    {
      // realpath leaves fn undefined on failure; use the name as given
      _tcsncpy(fn, input, SSDEEP_PATH_MAX);
      fn[SSDEEP_PATH_MAX - 1] = 0;
    }
  }
  else
    snprintf(fn, SSDEEP_PATH_MAX, "%s%c%s", cwd, DIR_SEPARATOR, input);
#endif
}
// Program entry point.
//
// Parses the command line, then either processes standard input (when no
// file arguments remain) or walks the remaining arguments, handling each
// one according to the selected mode. Afterwards runs the match and
// cluster reports if those modes are set. Always returns EXIT_SUCCESS;
// the per-file 'status' values are stored but not examined here.
int main(int argc, char **argv)
{
int count, status, goal = argc;
state *s;
TCHAR *fn, *cwd;
#ifndef __GLIBC__
// __progname = basename(argv[0]);
#endif
s = new state;
if (initialize_state(s))
fatal_error("%s: Unable to initialize state variable", __progname);
process_cmd_line(s,argc,argv);
#ifdef _WIN32
// On Win32 s->argc/s->argv come from prepare_windows_command_line()
// instead of the argc/argv handed to main()
if (prepare_windows_command_line(s))
fatal_error("%s: Unable to process command line arguments", __progname);
#else
s->argc = argc;
s->argv = argv;
#endif
// Anything left on the command line at this point is a file
// or directory we're supposed to process. If there's nothing
// specified, we should tackle standard input
if (optind == argc) {
status = process_stdin(s);
}
else {
MD5DEEP_ALLOC(TCHAR, fn, SSDEEP_PATH_MAX);
MD5DEEP_ALLOC(TCHAR, cwd, SSDEEP_PATH_MAX);
cwd = _tgetcwd(cwd, SSDEEP_PATH_MAX);
if (NULL == cwd)
fatal_error("%s: %s", __progname, strerror(errno));
// optind is the index of the first non-option argument
count = optind;
// The signature comparison mode needs to use the command line
// arguments and argument count. We don't do wildcard expansion
// on it on Win32 (i.e. where it matters). The setting of 'goal'
// to the original argc occurred at the start of main(), so we just
// need to update it if we're *not* in signature compare mode.
if (not (s->mode & mode_sigcompare)) {
goal = s->argc;
}
while (count < goal)
{
// The two signature-based modes read the raw argv entries; only the
// normal hashing path goes through s->argv and generate_filename()
if (MODE(mode_sigcompare))
match_load(s,argv[count]);
else if (MODE(mode_compare_unknown))
match_compare_unknown(s,argv[count]);
else {
generate_filename(s, fn, cwd, s->argv[count]);
#ifdef _WIN32
status = process_win32(s, fn);
#else
status = process_normal(s, fn);
#endif
}
++count;
}
// If we processed files, but didn't find anything large enough
// to be meaningful, we should display a warning message to the user.
// This happens mostly when people are testing very small files
// e.g. $ echo "hello world" > foo && ssdeep foo
if ((not s->found_meaningful_file) and s->processed_file)
{
print_error(s,"%s: Did not process files large enough to produce meaningful results", __progname);
}
}
// If the user has requested us to compare signature files, use
// our existing code to pretty-print directory matching to do the
// work for us.
if (MODE(mode_sigcompare))
s->mode |= mode_match_pretty;
if (MODE(mode_match_pretty) or MODE(mode_sigcompare) or MODE(mode_cluster))
find_matches_in_known(s);
if (MODE(mode_cluster))
display_clusters(s);
return (EXIT_SUCCESS);
}
// ssdeep
// Copyright (C) 2012 Kyrus
//
// $Id: main.h 144 2012-04-24 14:59:33Z jessekornblum $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#ifndef __MAIN_H
#define __MAIN_H
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <sys/stat.h>
#include <unistd.h>
#include <ctype.h>
#include <inttypes.h>
#ifdef HAVE_DIRENT_H
# include <dirent.h>
#endif
#ifdef TIME_WITH_SYS_TIME
# include <sys/time.h>
# include <time.h>
#else
# ifdef HAVE_SYS_TIME_H
# include <sys/time.h>
# else
# include <time.h>
# endif
#endif
#ifdef HAVE_SYS_TYPES_H
# include <sys/types.h>
#endif
#ifdef HAVE_SYS_PARAM_H
# include <sys/param.h>
#endif
#ifdef HAVE_SYS_STAT_H
# include <sys/stat.h>
#endif
#ifdef HAVE_SYS_IOCTL_H
# include <sys/ioctl.h>
#endif
#ifdef HAVE_SYS_MOUNT_H
# include <sys/mount.h>
#endif
#ifdef HAVE_SYS_DISK_H
# include <sys/disk.h>
#endif
#ifdef HAVE_LIBGEN_H
# include <libgen.h>
#endif
// This allows us to open standard input in binary mode by default
// See http://gnuwin32.sourceforge.net/compile.html for more.
// Technically it isn't needed in ssdeep as we don't process standard
// input. But it was part of Jesse's template, so in it goes!
#ifdef HAVE_FCNTL_H
# include <fcntl.h>
#endif
#ifndef HAVE_FSEEKO
# define fseeko fseek
# define ftello ftell
#endif
#define FALSE 0
#define TRUE 1
#ifndef MIN
#define MIN(a,b) ((a)<(b)?(a):(b))
#endif
#ifndef MAX
#define MAX(a,b) ((a)>(b)?(a):(b))
#endif
#endif // #ifndef __MAIN_H
// ssdeep
// (C) Copyright 2012 Kyrus
//
// $Id: match.cpp 164 2012-07-23 16:12:36Z jessekornblum $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#include "match.h"
// The longest line we should encounter when reading files of known hashes
#define MAX_STR_LEN 2048
#define MIN_SUBSTR_LEN 7
// ------------------------------------------------------------------
// SIGNATURE FILE FUNCTIONS
// ------------------------------------------------------------------
/// Open a file of known hashes and determine if it's valid
///
/// On success the open handle, the line number (1, for the header line)
/// and a copy of the filename are stored in s. On failure the handle is
/// closed and s->known_handle is left NULL.
///
/// @param s State variable
/// @param fn filename to open
///
/// @return Returns false on success, true on error
bool sig_file_open(state *s, const char * fn)
{
  if (NULL == s or NULL == fn)
    return true;

  s->known_handle = fopen(fn,"rb");
  if (NULL == s->known_handle)
  {
    if ( ! (MODE(mode_silent)) )
      perror(fn);
    return true;
  }

  // The first line of the file should contain a valid ssdeep header.
  char buffer[MAX_STR_LEN];
  if (NULL == fgets(buffer,MAX_STR_LEN,s->known_handle))
  {
    if ( ! (MODE(mode_silent)) )
      perror(fn);
    fclose(s->known_handle);
    // Don't leave a dangling handle behind for the other sig_file_*
    // functions, which test known_handle against NULL
    s->known_handle = NULL;
    return true;
  }

  chop_line(buffer);

  if (strncmp(buffer,SSDEEPV1_0_HEADER,MAX_STR_LEN) and
      strncmp(buffer,SSDEEPV1_1_HEADER,MAX_STR_LEN))
  {
    if ( ! (MODE(mode_silent)) )
      print_error(s,"%s: Invalid file header.", fn);
    fclose(s->known_handle);
    s->known_handle = NULL;
    return true;
  }

  // We've now read the first line
  s->line_number = 1;
  s->known_fn = strdup(fn);
  if (NULL == s->known_fn)
  {
    // Out of memory for the filename copy
    fclose(s->known_handle);
    s->known_handle = NULL;
    return true;
  }

  return false;
}
/// @brief Read the next entry in a file of known hashes and convert
/// it to a Filedata
///
/// @param s State variable
/// @param f Receives a newly allocated Filedata on success. It is not
/// written on failure.
///
/// @return Returns true if there is no entry to read or on error.
/// Otherwise, false.
bool sig_file_next(state *s, Filedata ** f)
{
  if (NULL == s or NULL == f or NULL == s->known_handle)
    return true;

  char buffer[MAX_STR_LEN];
  memset(buffer,0,MAX_STR_LEN);
  if (NULL == fgets(buffer,MAX_STR_LEN,s->known_handle))
    return true;

  s->line_number++;
  chop_line(buffer);

  try
  {
    *f = new Filedata(std::string(buffer),s->known_fn);
  }
  // Catch by const reference rather than by value so the exception
  // object is neither copied nor sliced
  catch (const std::bad_alloc &)
  {
    // This can happen on a badly formatted line, or a blank one.
    // We don't display errors on blank lines.
    if (strlen(buffer) > 0)
      print_error(s,
                  "%s: Bad hash in line %llu",
                  s->known_fn,
                  s->line_number);

    return true;
  }

  return false;
}
/// @brief Close the file of known hashes and release the stored filename
///
/// Frees s->known_fn and closes s->known_handle. Both members are reset to
/// NULL afterwards so that a repeated call cannot free or close them twice.
///
/// @param s State variable
///
/// @return Returns false on success, true if s is NULL, if no file was
/// open, or if closing the file failed
bool sig_file_close(state *s)
{
  if (NULL == s)
    return true;

  free(s->known_fn);
  s->known_fn = NULL;

  // Nothing to close. (The previous test was inverted: it returned early
  // for an open handle, leaking it, and passed a NULL handle to fclose.)
  if (NULL == s->known_handle)
    return true;

  FILE * handle = s->known_handle;
  s->known_handle = NULL;

  return (fclose(handle) != 0);
}
/// @brief Report whether reading the file of known hashes should stop
///
/// @param s State variable
///
/// @return Returns true when there is no state or no open file, when the
/// end of the file has been reached, or when the stream is in an error
/// state. Treating a read error as the end keeps the do/while readers
/// from spinning on a stream that can no longer deliver data.
bool sig_file_end(state *s)
{
  if (NULL == s or NULL == s->known_handle)
    return true;

  return (feof(s->known_handle) or ferror(s->known_handle));
}
// ------------------------------------------------------------------
// MATCHING FUNCTIONS
// ------------------------------------------------------------------
/// @brief Display the contents of every cluster stored in the state
///
/// For each cluster in s->all_clusters this prints a header containing the
/// cluster size, then calls display_filename followed by print_status("")
/// for each member, and finally print_status("") once more per cluster.
///
/// @param s State variable; nothing is done when it is NULL
void display_clusters(const state *s)
{
  if (NULL == s)
    return;

  std::set<std::set<Filedata *> *>::const_iterator it;
  for (it = s->all_clusters.begin(); it != s->all_clusters.end() ; ++it)
  {
    // size() yields a size_t; convert it explicitly so the argument
    // really has the type that %u expects when passed through varargs.
    print_status("** Cluster size %u",
                 static_cast<unsigned int>((*it)->size()));

    std::set<Filedata *>::const_iterator cit;
    for (cit = (*it)->begin() ; cit != (*it)->end() ; ++cit)
    {
      display_filename(stdout,(*cit)->get_filename(),FALSE);
      print_status("");
    }

    print_status("");
  }
}
/// @brief Put src into the cluster that dest already belongs to
///
/// @param dest File whose cluster receives the new member
/// @param src File to add; its cluster is set to dest's cluster
void cluster_add(Filedata * dest, Filedata * src)
{
  std::set<Filedata *> * target = dest->get_cluster();

  target->insert(src);
  src->set_cluster(target);
}
/// @brief Merge the clusters of two files into a single cluster
///
/// The members of the smaller cluster are copied into the larger one
/// (fewer items to move). When both have the same size, b's cluster is
/// the one that is kept.
///
/// @param s State variable holding the set of all clusters
/// @param a First file
/// @param b Second file
void cluster_join(state *s, Filedata * a, Filedata * b)
{
  // Both files already share a cluster: nothing to merge
  if (a->get_cluster() == b->get_cluster())
    return;

  const bool a_is_larger =
    a->get_cluster()->size() > b->get_cluster()->size();
  Filedata * keeper = a_is_larger ? a : b;
  Filedata * donor  = a_is_larger ? b : a;

  // Copy every member of the donor's cluster into the keeper's cluster
  keeper->get_cluster()->insert(donor->get_cluster()->begin(),
                                donor->get_cluster()->end());

  // The donor's cluster is no longer tracked
  s->all_clusters.erase(donor->get_cluster());

  // clear_cluster resets the donor's cluster pointer, so the old cluster
  // must not be touched through donor past this point.
  donor->clear_cluster();
  donor->set_cluster(keeper->get_cluster());
}
/// @brief Record that files a and b belong in the same cluster
///
/// Depending on which of the two already has a cluster, the other file is
/// added to it, the two clusters are joined, or a brand new cluster
/// holding both files is created and registered in s->all_clusters.
///
/// @param s State variable
/// @param a First matching file
/// @param b Second matching file
void handle_clustering(state *s, Filedata *a, Filedata *b)
{
  const bool a_clustered = a->has_cluster();
  const bool b_clustered = b->has_cluster();

  if (a_clustered and b_clustered)
  {
    // Both are clustered: combine the existing clusters
    cluster_join(s,a,b);
  }
  else if (a_clustered)
  {
    // Only a has a cluster, so b joins it
    cluster_add(a,b);
  }
  else if (b_clustered)
  {
    // Only b has a cluster, so a joins it
    cluster_add(b,a);
  }
  else
  {
    // Neither is clustered yet: start a new cluster with both
    std::set<Filedata *> * fresh = new std::set<Filedata *>();
    fresh->insert(a);
    fresh->insert(b);
    s->all_clusters.insert(fresh);
    a->set_cluster(fresh);
    b->set_cluster(fresh);
  }
}
/// @brief React to a match between two files according to the output mode
///
/// In CSV mode the two quoted filenames and the score are printed. In
/// cluster mode nothing is printed and the pair is handed to
/// handle_clustering. Otherwise an "A matches B (score)" line is written.
///
/// @param s State variable
/// @param a File being compared
/// @param b Known file it matched
/// @param score Match score reported for the pair
void handle_match(state *s, Filedata *a, Filedata *b, int score)
{
  if (s->mode & mode_csv)
  {
    printf("\"");
    display_filename(stdout,a->get_filename(),TRUE);
    printf("\",\"");
    display_filename(stdout,b->get_filename(),TRUE);
    print_status("\",%u", score);
    return;
  }

  if (s->mode & mode_cluster)
  {
    handle_clustering(s,a,b);
    return;
  }

  // Plain output. A match file name may be empty, in which case neither
  // the name nor the colon separating it from the filename is printed.
  if (a->has_match_file())
    printf ("%s:", a->get_match_file().c_str());
  display_filename(stdout,a->get_filename(),FALSE);

  printf (" matches ");

  if (b->has_match_file())
    printf ("%s:", b->get_match_file().c_str());
  display_filename(stdout,b->get_filename(),FALSE);

  print_status(" (%u)", score);
}
/// @brief Compare the file f against every entry in the set of knowns
///
/// Each pair whose score exceeds s->threshold (or every pair, when
/// mode_display_all is set) is passed to handle_match.
///
/// @param s State variable
/// @param f Filedata for the file being compared
///
/// @return Returns true if at least one match was handled, false otherwise
bool match_compare(state *s, Filedata * f)
{
  if (NULL == s)
    fatal_error("%s: Null state passed into match_compare", __progname);

  bool found = false;
  const size_t f_len = _tcslen(f->get_filename());

  for (std::vector<Filedata *>::const_iterator known = s->all_files.begin();
       known != s->all_files.end();
       ++known)
  {
    Filedata * other = *known;

    // When in pretty mode, we still want to avoid printing
    // A matches A (100).
    if (s->mode & mode_match_pretty)
    {
      const bool same_entry =
        (0 == _tcsncmp(f->get_filename(),
                       other->get_filename(),
                       std::max(f_len,_tcslen(other->get_filename())))) and
        (f->get_signature() == other->get_signature());

      // Identical entries are skipped, unless they come from different
      // matching files (such as what happens in sigcompare mode). An
      // entry without a match file, as in normal pretty print mode, is
      // always skipped.
      if (same_entry and
          (not(f->has_match_file()) or
           f->get_match_file() == other->get_match_file()))
        continue;
    }

    int score = fuzzy_compare(f->get_signature().c_str(),
                              other->get_signature().c_str());
    if (-1 == score)
    {
      print_error(s, "%s: Bad hashes in comparison", __progname);
      continue;
    }

    if (score > s->threshold or MODE(mode_display_all))
    {
      handle_match(s,f,other,score);
      found = true;
    }
  }

  return found;
}
/// @brief Compare every known file against the whole set of known files
///
/// @param s State variable
///
/// @return Returns true if s is NULL, false otherwise
bool find_matches_in_known(state *s)
{
  if (NULL == s)
    return true;

  // Walk the vector which contains all of the known files
  for (std::vector<Filedata *>::const_iterator cur = s->all_files.begin();
       cur != s->all_files.end();
       ++cur)
  {
    const bool matched = match_compare(s,*cur);

    // Pretty mode and sigcompare mode want a blank line after each file
    // that produced output; clustering mode displays nothing at this point.
    if (not matched)
      continue;
    if (MODE(mode_cluster))
      continue;
    print_status("");
  }

  return false;
}
/// @brief Append a single hash to the set of known hashes
///
/// @param s State variable
/// @param f Entry to append to s->all_files
///
/// @return Returns false on success, true if s is NULL
bool match_add(state *s, Filedata * f)
{
  if (s != NULL)
  {
    s->all_files.push_back(f);
    return false;
  }

  return true;
}
/// @brief Load a file of known hashes into the set of knowns
///
/// @param s State variable
/// @param fn Name of the file of known hashes
///
/// @return Returns true if an argument is NULL or the file could not be
/// opened, false otherwise. Individual bad entries do not change the
/// return value.
bool match_load(state *s, const char *fn)
{
  if (NULL == s or NULL == fn)
    return true;

  if (sig_file_open(s,fn))
    return true;

  for (;;)
  {
    Filedata * entry;
    const bool failed = sig_file_next(s,&entry);

    if (not failed and match_add(s,entry))
    {
      // One bad hash doesn't mean this load was a failure, so the
      // return status is left alone even though we stop reading here.
      print_error(s,"%s: unable to insert hash", fn);
      break;
    }

    if (sig_file_end(s))
      break;
  }

  sig_file_close(s);

  return false;
}
/// @brief Compare every hash stored in the file fn to the set of knowns
///
/// @param s State variable
/// @param fn Name of a file of hashes to read and compare
///
/// @return Returns true if an argument is NULL or the file could not be
/// opened, false otherwise
bool match_compare_unknown(state *s, const char * fn)
{
  if (NULL == s or NULL == fn)
    return true;

  if (sig_file_open(s,fn))
    return true;

  for (;;)
  {
    Filedata * entry;

    // Only entries that were read successfully are compared
    if (not sig_file_next(s,&entry))
      match_compare(s,entry);

    if (sig_file_end(s))
      break;
  }

  sig_file_close(s);

  return false;
}
#ifndef __MATCH_H
#define __MATCH_H
// SSDEEP
// $Id$
// Copyright (C) 2012 Kyrus.
#include "ssdeep.h"
#include "filedata.h"
// *********************************************************************
// Matching functions
// *********************************************************************
/// @brief Match the file f against the set of knowns
///
/// @return Returns false if there are no matches, true if at least one match
/// @param s State variable
/// @param f Filedata structure for the file.
bool match_compare(state *s, Filedata * f);
/// @brief Load a file of known hashes
///
/// @return Returns false on success, true on error
bool match_load(state *s, const char *fn);
/// @brief Add a single new hash to the set of known hashes
///
/// @return Returns false on success, true on error
bool match_add(state *s, Filedata * f);
/// Find and display all matches in the set of known hashes
bool find_matches_in_known(state *s);
/// Load the known hashes from the file fn and compare them to the
/// set of known hashes
bool match_compare_unknown(state *s, const char * fn);
/// Display the results of clustering operations
void display_clusters(const state *s);
#endif // ifndef __MATCH_H
#! /bin/sh
# Common wrapper for a few potentially missing GNU programs.
scriptversion=2012-06-26.16; # UTC
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
# Originally written by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
# the same distribution terms that you use for the rest of that program.
# Without any argument there is no program to run: point at --help and fail.
if test $# -eq 0; then
echo 1>&2 "Try '$0 --help' for more information"
exit 1
fi
# Handle the options understood by this wrapper itself. Anything that does
# not start with a dash falls through and is treated as the program to run.
case $1 in
--is-lightweight)
# Used by our autoconf macros to check whether the available missing
# script is modern enough.
exit 0
;;
--run)
# Back-compat with the calling convention used by older automake.
shift
;;
-h|--h|--he|--hel|--help)
echo "\
$0 [OPTION]... PROGRAM [ARGUMENT]...
Run 'PROGRAM [ARGUMENT]...', returning a proper advice when this fails due
to PROGRAM being missing or too old.
Options:
-h, --help display this help and exit
-v, --version output version information and exit
Supported PROGRAM values:
aclocal autoconf autoheader autom4te automake makeinfo
bison yacc flex lex help2man
Version suffixes to PROGRAM as well as the prefixes 'gnu-', 'gnu', and
'g' are ignored when checking the name.
Send bug reports to <bug-automake@gnu.org>."
exit $?
;;
-v|--v|--ve|--ver|--vers|--versi|--versio|--version)
echo "missing $scriptversion (GNU Automake)"
exit $?
;;
-*)
echo 1>&2 "$0: unknown '$1' option"
echo 1>&2 "Try '$0 --help' for more information"
exit 1
;;
esac
# Run the given program, remember its exit status.
"$@"; st=$?
# If it succeeded, we are done.
test $st -eq 0 && exit 0
# Also exit now if it failed (or wasn't found), and '--version' was
# passed; such an option is passed most likely to detect whether the
# program is present and works.
case $2 in --version|--help) exit $st;; esac
# Exit code 63 means version mismatch. This often happens when the user
# tries to use an ancient version of a tool on a file that requires a
# minimum version.
# $msg is set here and read later by give_advice.
if test $st -eq 63; then
msg="probably too old"
elif test $st -eq 127; then
# Program was missing.
msg="missing on your system"
else
# Program was found and executed, but failed. Give up.
exit $st
fi
perl_URL=http://www.perl.org/
flex_URL=http://flex.sourceforge.net/
gnu_software_URL=http://www.gnu.org/software

# program_details PROGRAM
# Print which package provides PROGRAM and what that package needs in
# order to run. Nothing is printed for programs not listed below.
program_details ()
{
  case $1 in
    aclocal|automake)
      printf '%s\n' \
        "The '$1' program is part of the GNU Automake package:" \
        "<$gnu_software_URL/automake>" \
        "It also requires GNU Autoconf, GNU m4 and Perl in order to run:" \
        "<$gnu_software_URL/autoconf>" \
        "<$gnu_software_URL/m4/>" \
        "<$perl_URL>"
      ;;
    autoconf|autom4te|autoheader)
      printf '%s\n' \
        "The '$1' program is part of the GNU Autoconf package:" \
        "<$gnu_software_URL/autoconf/>" \
        "It also requires GNU m4 and Perl in order to run:" \
        "<$gnu_software_URL/m4/>" \
        "<$perl_URL>"
      ;;
  esac
}
# give_advice PROGRAM
# Print, on standard output, why PROGRAM was probably needed and where to
# get it. The first line reports the problem using $msg, which the caller
# must have set. The prefixes 'gnu-', 'gnu' and 'g' are stripped from
# PROGRAM before it is matched against the known tools.
give_advice ()
{
  # Normalize program name to check for.
  normalized_program=`echo "$1" | sed '
s/^gnu-//; t
s/^gnu//; t
s/^g//; t'`

  printf '%s\n' "'$1' is $msg."

  configure_deps="'configure.ac' or m4 files included by 'configure.ac'"
  case $normalized_program in
    autoconf*)
      echo "You should only need it if you modified 'configure.ac',"
      echo "or m4 files included by it."
      program_details 'autoconf'
      ;;
    autoheader*)
      echo "You should only need it if you modified 'acconfig.h' or"
      echo "$configure_deps."
      program_details 'autoheader'
      ;;
    automake*)
      echo "You should only need it if you modified 'Makefile.am' or"
      echo "$configure_deps."
      program_details 'automake'
      ;;
    aclocal*)
      echo "You should only need it if you modified 'acinclude.m4' or"
      echo "$configure_deps."
      program_details 'aclocal'
      ;;
    autom4te*)
      # The message used to name a non-existent 'automa4te' program.
      echo "You might have modified some maintainer files that require"
      echo "the 'autom4te' program to be rebuilt."
      program_details 'autom4te'
      ;;
    bison*|yacc*)
      echo "You should only need it if you modified a '.y' file."
      echo "You may want to install the GNU Bison package:"
      echo "<$gnu_software_URL/bison/>"
      ;;
    lex*|flex*)
      echo "You should only need it if you modified a '.l' file."
      echo "You may want to install the Fast Lexical Analyzer package:"
      echo "<$flex_URL>"
      ;;
    help2man*)
      echo "You should only need it if you modified a dependency" \
           "of a man page."
      echo "You may want to install the GNU Help2man package:"
      echo "<$gnu_software_URL/help2man/>"
      ;;
    makeinfo*)
      echo "You should only need it if you modified a '.texi' file, or"
      echo "any other file indirectly affecting the aspect of the manual."
      echo "You might want to install the Texinfo package:"
      echo "<$gnu_software_URL/texinfo/>"
      echo "The spurious makeinfo call might also be the consequence of"
      echo "using a buggy 'make' (AIX, DU, IRIX), in which case you might"
      echo "want to install GNU make:"
      echo "<$gnu_software_URL/make/>"
      ;;
    *)
      echo "You might have modified some files without having the proper"
      echo "tools for further handling them. Check the 'README' file, it"
      echo "often tells you about the needed prerequisites for installing"
      echo "this package. You may also peek at any GNU archive site, in"
      echo "case some other package contains this missing '$1' program."
      ;;
  esac
}
# Send the advice to standard error: the first sed expression prefixes
# line 1 with "WARNING: ", the second one prefixes every following line
# so that the continuation lines are set off from the warning.
give_advice "$1" | sed -e '1s/^/WARNING: /' \
-e '2,$s/^/ /' >&2
# Propagate the correct exit status (expected to be 127 for a program
# not found, 63 for a program that failed due to version mismatch).
exit $st
# Local variables:
# eval: (add-hook 'write-file-hooks 'time-stamp)
# time-stamp-start: "scriptversion="
# time-stamp-format: "%:y-%02m-%02d.%02H"
# time-stamp-time-zone: "UTC"
# time-stamp-end: "; # UTC"
# End:
/* Fuzzy Hashing by Jesse Kornblum
Copyright (C) 2010 ManTech International Corporation
This program demonstrates some of the capabilities of
the fuzzy hashing library.
To compile the program using gcc:
$ gcc -Wall -I/usr/local/include -L/usr/local/lib sample.c -lfuzzy
Using mingw:
C:\> gcc -Wall -Ic:\path\to\includes sample.c fuzzy.dll
Using Microsoft Visual C:
C:\> lib /machine:i386 /def:fuzzy.def
C:\> cl sample.c fuzzy.lib
See the README that came with this file for more details on using
the library on Windows systems with Microsoft Visual C.
The functions generate_random and write_data are generic routines to make
random data for hashing. The real magic happens in the main() function.
THIS SOFTWARE IS NOT DESIGNED OR INTENDED FOR USE OR RESALE AS ON-LINE
CONTROL EQUIPMENT IN HAZARDOUS ENVIRONMENTS REQUIRING FAIL-SAFE
PERFORMANCE, SUCH AS IN THE OPERATION OF NUCLEAR FACILITIES, AIRCRAFT
NAVIGATION OR COMMUNICATION SYSTEMS, AIR TRAFFIC CONTROL, DIRECT LIFE
SUPPORT MACHINES, OR WEAPONS SYSTEMS, IN WHICH THE FAILURE OF THE
SOFTWARE COULD LEAD DIRECTLY TO DEATH, PERSONAL INJURY, OR SEVERE
PHYSICAL OR ENVIRONMENTAL DAMAGE ("HIGH RISK ACTIVITIES"). THE AUTHOR
SPECIFICALLY DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY OF FITNESS FOR
HIGH RISK ACTIVITIES. */
// $Id: sample.c 97 2010-03-19 15:10:06Z jessekornblum $
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <fuzzy.h>
#define FILENAME "foo.dat"
#define SIZE 0x50000
/* Fill buf with sz pseudo-random bytes taken from rand() and force the
   last byte to zero.

   buf  Buffer to fill; must hold at least sz bytes
   sz   Number of bytes to generate

   Nothing is done when buf is NULL or sz is zero. Without that guard an
   empty request would compute sz-1 in unsigned arithmetic and write far
   outside the buffer. */
void generate_random(unsigned char *buf, uint32_t sz)
{
  uint32_t i;

  if (NULL == buf || 0 == sz)
    return;

  /* rand() % 255 yields values 0..254; kept as is so that the data
     generated for a given seed does not change. */
  for (i = 0 ; i < sz ; ++i)
    buf[i] = (unsigned char)(rand() % 255);

  buf[(sz-1)] = 0;
}
/* Write sz bytes from buf to the file fn, replacing any previous content.

   buf  Data to write
   sz   Number of bytes to write
   fn   Name of the file to create

   Returns 0 on success and 1 on error: a NULL argument, a file that
   cannot be opened, a short write, or a failure while closing the file
   (which is where buffered data may actually hit the disk). */
int write_data(const unsigned char *buf,
               const uint32_t sz,
               const char *fn)
{
  int status = 0;
  FILE * handle;

  if (NULL == fn || (NULL == buf && sz > 0))
    return 1;

  printf ("Writing to %s\n", fn);
  handle = fopen(fn,"wb");
  if (NULL == handle)
    return 1;

  /* fwrite reports the number of complete items written; with a single
     item of sz bytes anything other than 1 is a failure. A zero-length
     write is skipped because it would legitimately report 0 items. */
  if (sz > 0 && fwrite(buf,sz,1,handle) != 1)
    status = 1;

  if (fclose(handle) != 0)
    status = 1;

  return status;
}
/* Demonstrate the fuzzy hashing library: hash a buffer of random data,
   hash the same data from a file, modify the buffer slightly, hash it
   again and compare the two signatures.

   Returns EXIT_SUCCESS when the demonstration ran to the end and
   EXIT_FAILURE when memory could not be allocated or the data file could
   not be written or reopened. All buffers are released on every path. */
int main(int argc, char **argv)
{
  unsigned char * buf;
  char * result, * result2;
  FILE *handle;
  int status;
  int i;
  int rc = EXIT_FAILURE;

  (void)argc;

  srand(1);

  buf     = (unsigned char *)malloc(SIZE);
  result  = (char *)malloc(FUZZY_MAX_RESULT);
  result2 = (char *)malloc(FUZZY_MAX_RESULT);
  if (NULL == result || NULL == buf || NULL == result2)
  {
    fprintf (stderr,"%s: Out of memory\n", argv[0]);
    goto cleanup;
  }

  /* Start from empty strings so that the comparison below never reads
     uninitialised memory if one of the hashing calls fails. */
  result[0]  = '\0';
  result2[0] = '\0';

  generate_random(buf,SIZE);

  if (write_data(buf,SIZE,FILENAME))
    goto cleanup;

  printf ("Hashing buffer\n");
  status = fuzzy_hash_buf(buf,SIZE,result);
  if (status)
    printf ("Error during buf hash\n");
  else
    printf ("%s\n", result);

  handle = fopen(FILENAME,"rb");
  if (NULL == handle)
  {
    perror(FILENAME);
    goto cleanup;
  }

  printf ("Hashing file\n");
  status = fuzzy_hash_file(handle,result);
  if (status)
    printf ("Error during file hash\n");
  else
    printf ("%s\n", result);

  fclose(handle);

  printf ("Modifying buffer and comparing to file\n");
  for (i = 0x100 ; i < 0x110 ; ++i)
    buf[i] = 37;

  status = fuzzy_hash_buf(buf,SIZE,result2);
  if (status)
    printf ("Error during buffer hash\n");
  else
    printf ("%s\n", result2);

  i = fuzzy_compare(result,result2);
  if (-1 == i)
    printf ("An error occured during matching\n");
  else
  {
    if (i != 0)
      printf ("MATCH: score = %d\n", i);
    else
      printf ("did not match\n");
  }

  rc = EXIT_SUCCESS;

cleanup:
  /* free(NULL) is a no-op, so this is safe after a partial allocation */
  free(result2);
  free(result);
  free(buf);

  return rc;
}
.TH SSDEEP "1" "Version 2.10 \- 17 Jul 2013" "Facebook" "Facebook"
.SH NAME
ssdeep - Computes context triggered piecewise hashes (fuzzy hashes)
.SH SYNOPSIS
.B ssdeep [-m <file>] [-k <file>] [-vdprgsblcxa] [-t val] [FILES]
.br
.B ssdeep [-V|h]
.SH DESCRIPTION
.PP
Computes a signature based on context triggered piecewise hashes
for each input file, also called a fuzzy hash.
If requested, the program matches those signatures against
a file of known signatures and reports any possible matches.
It can also examine one or more files of signatures and find any
matches in those files.
Output is written to standard out and errors to standard error.
The program only accepts the first 100MB of data presented
via standard input.
.TP
\fB\-m <file>\fR
Loads the specified file of known hashes to be used for matching. This file must
be a previous output of the program. The program
then hashes each entry in FILES and compares these signatures to the known signatures.
Any matches which score above the threshold are displayed.
This flag may be used multiple times to load more known signatures.
This flag may not be used with the \-k or \-x flags.
.TP
\fB\-k <file>\fR
Load the specified file of known hashes to be used for matching. This file must
be a previous output of the program. The program
then treats each entry in FILES as a set of known hashes as well. The hashes in these
FILES are compared to the known hashes from this file. Matches which score
above the threshold are displayed. Both the file specified here and the
input FILES should contain fuzzy hashes.
This flag may be used multiple times to load more known signatures.
This flag may not be used with the \-m, \-d, or \-p flags.
.TP
\fB\-v\fR
Verbose mode. The name of each file is printed to standard error
as it is being hashed.
.TP
\fB\-d\fR
Computes a signature for each entry in the FILES and compares it to the set
of known signatures. Matches which score above the threshold are displayed. The
computed signature is then added to the set of known signatures.
This flag may not be used with the \-k or \-x flags.
.TP
\fB\-p\fR
Works like the \-d flag, but displays all matches for each file. That is,
for two files A and B whose match score is above the threshold, displays
both "A matches B" and "B matches A".
This flag may not be used with the \-k or \-x flags.
.TP
\fB\-r\fR
Enables recursive mode. All subdirectories are traversed.
Please note that recursive mode cannot be used to examine all
files of a given file extension. For example, invoking the program with
\fB\-r *.txt\fR will examine all files in directories that end in .txt.
If you want to process all files in a directory tree with the .txt suffix,
try using the \fBfind(1)\fR command.
.TP
\fB\-g\fR
Similar files are grouped together into clusters. This can be handy
for finding more similar files. That is, if you are searching for file
A, which matches B, anything which matches B will also be included in
the cluster.
.TP
\fB\-s\fR
Silent mode. All error messages are suppressed.
.TP
\fB\-b\fR
Enables bare mode. Strips any leading directory information from
displayed filenames.
This flag may not be used in conjunction with the \fB\-l\fR flag.
.TP
\fB\-l\fR
Enables relative file paths. Instead of printing the absolute path for
each file, displays the relative file path as indicated on the command
line. This flag may not be used in conjunction with the \fB\-b\fR flag.
.TP
\fB\-c\fR
Enables comma separated output mode. In any of the matching modes
\-d, \-p, or \-m,
displays the results as input file, known file, matching score.
.TP
\fB\-x\fR
Signature file matching.
Each entry in FILES must contain signatures generated by a previous output
of the program. Each signature is loaded and compared against the set of
known hashes. Match scores above the threshold are displayed. Each signature
is then added to the set of knowns.
This flag may not be used with the \-m, \-d, or \-p flags.
.TP
\fB\-a\fR
Displays all matches in any of the matching modes, regardless of score.
Using the \-a flag displays all results, even if the match score is zero.
.TP
\fB\-t <val>\fR
In any of the matching modes, only display matches when match
score is greater than the given value. The default threshold value is zero.
.TP
\fB\-h\fR
Show a help screen and exit.
.TP
\fB\-V\fR
Show the version number and exit.
.SH RETURN VALUE
Returns 0 on success, 1 if there is a problem.
Read errors, permission denied, and encountering directories while
not in recursive mode are still considered successes. Problems are
things like being unable to load the matching file, specifying
both bare and relative paths, etc.
.SH AUTHOR
ssdeep was written by Jesse Kornblum of Facebook,
.br
research@jessekornblum.com
.PP
.SH COPYRIGHT
This program is Copyright (C) 2013 Facebook and is licensed under the terms
of the General Public License. See the file COPYING for details.
.SH SEE ALSO
This program is based on SpamSum by Dr. Andrew Tridgell.
.br
http://www.samba.org/ftp/unpacked/junkcode/spamsum/
#ifndef __SSDEEP_H
#define __SSDEEP_H
// Fuzzy Hashing by Jesse Kornblum
// Copyright (C) 2013 Facebook
// Copyright (C) 2012 Kyrus
// Copyright (C) 2008 ManTech International Corporation
//
// $Id: ssdeep.h 190 2013-07-11 00:40:22Z jessekornblum $
//
#include "main.h"
#include <string>
#include <map>
#include <set>
#include <vector>
#include "fuzzy.h"
#include "tchar-local.h"
#include "filedata.h"
// This is a kludge, but it works.
#define __progname "ssdeep"
#define SSDEEPV1_0_HEADER "ssdeep,1.0--blocksize:hash:hash,filename"
#define SSDEEPV1_1_HEADER "ssdeep,1.1--blocksize:hash:hash,filename"
#define OUTPUT_FILE_HEADER SSDEEPV1_1_HEADER
// We print a warning for files smaller than this size
#define SSDEEP_MIN_FILE_SIZE 4096
// The default 'PATH_MAX' on Windows is about 255 bytes. We can expand
// this limit to 32,767 characters by prepending filenames with "\\?\"
#define SSDEEP_PATH_MAX 32767
// Allocate and zero an array of SIZE elements of TYPE and store it in VAR.
// On allocation failure the *enclosing function* returns EXIT_FAILURE, so
// this may only be used in functions returning int.
// SIZE is parenthesised so that an expression argument such as `a + b`
// sizes the whole array instead of being split by operator precedence.
// The expansion is several statements and already ends in a semicolon;
// do not use it as the unbraced body of an if/else or a loop.
#define MD5DEEP_ALLOC(TYPE,VAR,SIZE) \
VAR = (TYPE *)malloc(sizeof(TYPE) * (SIZE)); \
if (NULL == VAR) \
return EXIT_FAILURE; \
memset(VAR,0,(SIZE) * sizeof(TYPE));
// These are the types of files we can encounter while hashing
#define file_regular 0
#define file_directory 1
#define file_door 2
#define file_block 3
#define file_character 4
#define file_pipe 5
#define file_socket 6
#define file_symlink 7
#define file_unknown 254
/// Plain record describing one hashed file: its fuzzy-hash signature, the
/// pieces of that signature, the file's name and bookkeeping for matching.
/// NOTE(review): the matching code visible alongside this header works
/// with the Filedata class from filedata.h rather than with this struct;
/// confirm whether filedata_t is still used anywhere.
typedef struct _filedata_t
{
// Numeric identifier of the entry; how it is assigned is not visible here
uint64_t id;
/// Original signature in the form [blocksize]:[sig1]:[sig2]
std::string signature;
// presumably the [blocksize] part of the signature -- TODO confirm
uint64_t blocksize;
/// Holds signature equal to blocksize
std::string s1;
/// Holds signature equal to blocksize * 2
std::string s2;
// Name of the file (TCHAR so it can be a wide string on Windows)
TCHAR * filename;
/// File of hashes where we got this known file from.
std::string match_file;
/// Cluster which contains this file
std::set<_filedata_t> * cluster;
} filedata_t;
/// Program-wide state that is passed to nearly every function: the active
/// mode flags, the known hashes and clusters, and the file of known
/// hashes that is currently being read.
typedef struct {
// Bitwise OR of the mode_* flags; tested with the MODE() macro
uint64_t mode;
bool first_file_processed;
// Known hashes
std::vector<Filedata *> all_files;
// Known clusters
std::set< std::set<Filedata *> * > all_clusters;
/// Display files who score above the threshold
uint8_t threshold;
bool found_meaningful_file;
bool processed_file;
// presumably the program's command line arguments -- TODO confirm
int argc;
TCHAR **argv;
/// Current line number in file of known hashes
uint64_t line_number;
/// File handle to file of known hashes
FILE * known_handle;
/// Filename of known hashes
char * known_fn;
} state;
#define MM_INIT printf
// Things required when cross compiling for Microsoft Windows
#ifdef _WIN32
// We create macros for the Windows equivalent UNIX functions.
// No worries about lstat to stat; Windows doesn't have symbolic links
#define lstat(A,B) stat(A,B)
// Note the swapped argument order: the first argument of realpath
// becomes the second argument of _fullpath and vice versa.
#define realpath(A,B) _fullpath(B,A,PATH_MAX)
#define snprintf _snprintf
// Declarations for functions and getopt variables used on Windows;
// their definitions are not part of this header.
char *basename(char *a);
extern char *optarg;
extern int optind;
int getopt(int argc, char *const argv[], const char *optstring);
// Line ending and path separator for Windows
#define NEWLINE "\r\n"
#define DIR_SEPARATOR '\\'
#else // ifdef _WIN32
// For all other operating systems
#define NEWLINE "\n"
#define DIR_SEPARATOR '/'
#endif // ifdef _WIN32/else
// Because the modes are stored in a uint64_t variable, they must
// be less than or equal to 1<<63.
// Each value is wrapped in parentheses so that it stays a single operand
// wherever it is used: without them `~mode_silent` expands to `~1<<4`
// (which is (~1)<<4) and `mode_silent == 16` expands to `1<<(4 == 16)`.
// NOTE: the literals are plain ints. A flag at bit 31 or above must be
// written with a 64-bit literal (for example 1ULL<<40).
#define mode_none 0
#define mode_recursive 1
#define mode_match (1<<1)
#define mode_barename (1<<2)
#define mode_relative (1<<3)
#define mode_silent (1<<4)
#define mode_directory (1<<5)
#define mode_match_pretty (1<<6)
#define mode_verbose (1<<7)
#define mode_csv (1<<8)
#define mode_threshold (1<<9)
#define mode_sigcompare (1<<10)
#define mode_display_all (1<<11)
#define mode_compare_unknown (1<<12)
#define mode_cluster (1<<13)
#define mode_recursive_cluster (1<<14)
// Non-zero when mode flag A is set. Relies on a variable named `s` that
// points to the state being in scope at the point of use.
#define MODE(A) (s->mode & (A))
#define BLANK_LINE \
" "
// *********************************************************************
// Checking for cycles
// *********************************************************************
int done_processing_dir(TCHAR *fn);
int processing_dir(TCHAR *fn);
int have_processed_dir(TCHAR *fn);
bool process_win32(state *s, TCHAR *fn);
int process_normal(state *s, TCHAR *fn);
int process_stdin(state *s);
// *********************************************************************
// Fuzzy Hashing Engine
// *********************************************************************
int hash_file(state *s, TCHAR *fn);
bool display_result(state *s, const TCHAR * fn, const char * sum);
// *********************************************************************
// Helper functions
// *********************************************************************
void try_msg(void);
bool expanded_path(TCHAR *p);
void sanity_check(state *s, int condition, const char *msg);
// The basename function kept misbehaving on OS X, so I rewrote it.
// This function isn't perfect, nor is it designed to be. Because
// we're guaranteed to be working with a filename here, there's no way
// that s will end with a DIR_SEPARATOR (e.g. /foo/bar/). This function
// will not work properly for a string that ends in a DIR_SEPARATOR
int my_basename(TCHAR *s);
int my_dirname(TCHAR *s);
// Remove the newlines, if any, from the string. Works with both
// \r and \r\n style newlines
void chop_line_tchar(TCHAR *s);
void chop_line(char *s);
int find_comma_separated_string_tchar(TCHAR *s, unsigned int n);
void shift_string_tchar(TCHAR *fn, unsigned int start, unsigned int new_start);
int find_comma_separated_string(char *s, unsigned int n);
void shift_string(char *fn, size_t start, size_t new_start);
int remove_escaped_quotes(char * str);
void prepare_filename(state *s, TCHAR *fn);
// Returns the size of the given file, in bytes.
#ifdef __cplusplus
extern "C" {
#endif
off_t find_file_size(FILE *h);
#ifdef __cplusplus
}
#endif
// *********************************************************************
// User Interface Functions
// *********************************************************************
void print_status(const char *fmt, ...);
void print_error(const state *s, const char *fmt, ...);
void print_error_unicode(state *s, const TCHAR *fn, const char *fmt, ...);
void internal_error(const char *fmt, ... );
void fatal_error(const char *fmt, ... );
void display_filename(FILE *out, const TCHAR *fn, int escape_quotes);
#endif // #ifndef __SSDEEP_H
/* $Id: tchar-local.h 61 2008-02-22 23:18:59Z jessekornblum $ */
#ifndef __TCHAR_LOCAL_H
#define __TCHAR_LOCAL_H
/* Unicode support */
#ifdef _WIN32
// This says that we require Windows NT 4.0 to run
#define _WIN32_WINNT 0x0400
# include <windows.h>
# include <wchar.h>
# include <tchar.h>
/* The PRINTF_S character is used in situations where we have a string
with one TCHAR and one char argument. It's impossible to use the
_TEXT macro because we don't know which will be which. */
#define PRINTF_S "S"
#define _tmemmove wmemmove
/* The Win32 API does have lstat, just stat. As such, we don't have to
worry about the difference between the two. */
#define _lstat _tstat
#define _sstat _tstat
#define _tstat_t struct _stat
#else // ifdef _WIN32
#define PRINTF_S "s"
/* The next few paragraphs are similar to tchar.h when UNICODE
is not defined. They define all of the _t* functions to use
the standard char * functions. This works just fine on Linux and OS X */
#define TCHAR char
#define _TDIR DIR
#define _TEXT(A) A
#define _sntprintf snprintf
#define _tprintf printf
#define _ftprintf fprintf
#define _lstat lstat
#define _sstat stat
#define _tstat_t struct stat
#define _tgetcwd getcwd
#define _tfopen fopen
#define _fgetts fgets
#define _topendir opendir
#define _treaddir readdir
#define _tdirent dirent
#define _tclosedir closedir
#define _tcsncpy strncpy
#define _tcslen strlen
#define _tcsnicmp strncasecmp
#define _tcsncmp strncmp
#define _tcsrchr strrchr
#define _tmemmove memmove
#define _tcsdup strdup
#define _tcsstr strstr
#endif
#endif // __TCHAR_LOCAL_H
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment