/* ssdeep
 * Copyright (C) 2002 Andrew Tridgell <tridge@samba.org>
 * Copyright (C) 2006 ManTech International Corporation
 * Copyright (C) 2013 Helmut Grohne <helmut@subdivi.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * Earlier versions of this code were named fuzzy.c and can be found at:
 *     http://www.samba.org/ftp/unpacked/junkcode/spamsum/
 *     http://ssdeep.sf.net/
 */

#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "fuzzy.h"

#if defined(__GNUC__) && __GNUC__ >= 3
#define likely(x)       __builtin_expect(!!(x), 1)
#define unlikely(x)     __builtin_expect(!!(x), 0)
#else
#define likely(x) x
#define unlikely(x) x
#endif

#ifndef MIN
#define MIN(a,b) ((a)<(b)?(a):(b))
#endif

#ifndef MAX
#define MAX(a,b) ((a)>(b)?(a):(b))
#endif

#define ROLLING_WINDOW 7
#define MIN_BLOCKSIZE 3
#define HASH_PRIME 0x01000193
#define HASH_INIT 0x28021967
#define NUM_BLOCKHASHES 31

struct roll_state {
  unsigned char window[ROLLING_WINDOW];
  uint32_t h1, h2, h3;
  uint32_t n;
};

static void roll_init(/*@out@*/ struct roll_state *self) {
	memset(self, 0, sizeof(struct roll_state));
}

/*
 * a rolling hash, based on the Adler checksum. By using a rolling hash
 * we can perform auto resynchronisation after inserts/deletes

 * internally, h1 is the sum of the bytes in the window and h2
 * is the sum of the bytes times the index

 * h3 is a shift/xor based rolling hash, and is mostly needed to ensure that
 * we can cope with large blocksize values
 */
static void roll_hash(struct roll_state *self, unsigned char c)
{
  self->h2 -= self->h1;
  self->h2 += ROLLING_WINDOW * (uint32_t)c;

  self->h1 += (uint32_t)c;
  self->h1 -= (uint32_t)self->window[self->n % ROLLING_WINDOW];

  self->window[self->n % ROLLING_WINDOW] = c;
  self->n++;

  /* The original spamsum AND'ed this value with 0xFFFFFFFF which
   * in theory should have no effect. This AND has been removed
   * for performance (jk) */
  self->h3 <<= 5;
  self->h3 ^= c;
}

static uint32_t roll_sum(const struct roll_state *self)
{
  return self->h1 + self->h2 + self->h3;
}

/* A simple non-rolling hash, based on the FNV hash. */
static uint32_t sum_hash(unsigned char c, uint32_t h)
{
  return (h * HASH_PRIME) ^ c;
}

/* A blockhash contains a signature state for a specific (implicit) blocksize.
 * The blocksize is given by SSDEEP_BS(index). The h and halfh members are the
 * FNV hashes, where halfh stops to be reset after digest is SPAMSUM_LENGTH/2
 * long. The halfh hash is needed be able to truncate digest for the second
 * output hash to stay compatible with ssdeep output. */
struct blockhash_context
{
  uint32_t h, halfh;
  char digest[SPAMSUM_LENGTH];
  unsigned int dlen;
};

struct fuzzy_state
{
  unsigned int bhstart, bhend;
  struct blockhash_context bh[NUM_BLOCKHASHES];
  size_t total_size;
  struct roll_state roll;
};

#define SSDEEP_BS(index) (((uint32_t)MIN_BLOCKSIZE) << (index))

/*@only@*/ /*@null@*/ struct fuzzy_state *fuzzy_new(void)
{
  struct fuzzy_state *self;
  if(NULL == (self = malloc(sizeof(struct fuzzy_state))))
    /* malloc sets ENOMEM */
    return NULL;
  self->bhstart = 0;
  self->bhend = 1;
  self->bh[0].h = HASH_INIT;
  self->bh[0].halfh = HASH_INIT;
  self->bh[0].dlen = 0;
  self->total_size = 0;
  roll_init(&self->roll);
  return self;
}

static void fuzzy_try_fork_blockhash(struct fuzzy_state *self)
{
  struct blockhash_context *obh, *nbh;
  if (self->bhend >= NUM_BLOCKHASHES)
    return;
  assert(self->bhend > 0);
  obh = self->bh + (self->bhend - 1);
  nbh = obh + 1;
  nbh->h = obh->h;
  nbh->halfh = obh->halfh;
  nbh->dlen = 0;
  ++self->bhend;
}

static void fuzzy_try_reduce_blockhash(struct fuzzy_state *self)
{
  assert(self->bhstart < self->bhend);
  if (self->bhend - self->bhstart < 2)
    /* Need at least two working hashes. */
    return;
  if ((size_t)SSDEEP_BS(self->bhstart) * SPAMSUM_LENGTH >=
      self->total_size)
    /* Initial blocksize estimate would select this or a smaller
     * blocksize. */
    return;
  if (self->bh[self->bhstart + 1].dlen < SPAMSUM_LENGTH / 2)
    /* Estimate adjustment would select this blocksize. */
    return;
  /* At this point we are clearly no longer interested in the
   * start_blocksize. Get rid of it. */
  ++self->bhstart;
}

static const char *b64 =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

static void fuzzy_engine_step(struct fuzzy_state *self, unsigned char c)
{
  size_t h;
  unsigned int i;
  /* At each character we update the rolling hash and the normal hashes.
   * When the rolling hash hits a reset value then we emit a normal hash
   * as a element of the signature and reset the normal hash. */
  roll_hash(&self->roll, c);
  h = roll_sum(&self->roll);

  for (i = self->bhstart; i < self->bhend; ++i)
  {
    self->bh[i].h = sum_hash(c, self->bh[i].h);
    self->bh[i].halfh = sum_hash(c, self->bh[i].halfh);
  }

  for (i = self->bhstart; i < self->bhend; ++i)
  {
    /* With growing blocksize almost no runs fail the next test. */
    if (likely(h % SSDEEP_BS(i) != SSDEEP_BS(i) - 1))
      /* Once this condition is false for one bs, it is
       * automatically false for all further bs. I.e. if
       * h === -1 (mod 2*bs) then h === -1 (mod bs). */
      break;
    /* We have hit a reset point. We now emit hashes which are
     * based on all characters in the piece of the message between
     * the last reset point and this one */
    if (unlikely(0 == self->bh[i].dlen)) {
      /* Can only happen 30 times. */
      /* First step for this blocksize. Clone next. */
      fuzzy_try_fork_blockhash(self);
    }
    if (self->bh[i].dlen < SPAMSUM_LENGTH - 1) {
      /* We can have a problem with the tail overflowing. The
       * easiest way to cope with this is to only reset the
       * normal hash if we have room for more characters in
       * our signature. This has the effect of combining the
       * last few pieces of the message into a single piece
       * */
      self->bh[i].digest[self->bh[i].dlen++] =
	b64[self->bh[i].h % 64];
      self->bh[i].h = HASH_INIT;
      if (self->bh[i].dlen < SPAMSUM_LENGTH / 2)
	self->bh[i].halfh = HASH_INIT;
    } else
      fuzzy_try_reduce_blockhash(self);
  }
}

int fuzzy_update(struct fuzzy_state *self,
		 const unsigned char *buffer,
		 size_t buffer_size) {
  self->total_size += buffer_size;
  for ( ;buffer_size > 0; ++buffer, --buffer_size)
    fuzzy_engine_step(self, *buffer);
  return 0;
}

static int memcpy_eliminate_sequences(char *dst,
				      const char *src,
				      int n)
{
  const char *srcend = src + n;
  assert(n >= 0);
  if (src < srcend) *dst++ = *src++;
  if (src < srcend) *dst++ = *src++;
  if (src < srcend) *dst++ = *src++;
  while (src < srcend)
    if (*src == dst[-1] && *src == dst[-2] && *src == dst[-3])
    {
      ++src;
      --n;
    } else
      *dst++ = *src++;
  return n;
}

#ifdef S_SPLINT_S
extern const int EOVERFLOW;
#endif

// We need some extra help on Win32
#ifdef _WIN32
# define EOVERFLOW 84
# define ftello    ftell
# define fseeko    fseek
#endif

int fuzzy_digest(const struct fuzzy_state *self,
		 /*@out@*/ char *result,
		 unsigned int flags)
{
  unsigned int bi = self->bhstart;
  uint32_t h = roll_sum(&self->roll);
  int i, remain = FUZZY_MAX_RESULT - 1; /* Exclude terminating '\0'. */
  /* Verify that our elimination was not overeager. */
  assert(bi == 0 || (size_t)SSDEEP_BS(bi) / 2 * SPAMSUM_LENGTH <
	 self->total_size);

  /* Initial blocksize guess. */
  while ((size_t)SSDEEP_BS(bi) * SPAMSUM_LENGTH < self->total_size) {
    ++bi;
    if (bi >= NUM_BLOCKHASHES) {
      /* The input exceeds data types. */
      errno = EOVERFLOW;
      return -1;
    }
  }
  /* Adapt blocksize guess to actual digest length. */
  while (bi >= self->bhend)
    --bi;
  while (bi > self->bhstart && self->bh[bi].dlen < SPAMSUM_LENGTH / 2)
    --bi;
  assert (!(bi > 0 && self->bh[bi].dlen < SPAMSUM_LENGTH / 2));

  i = snprintf(result, (size_t)remain, "%u:", SSDEEP_BS(bi));
  if (i <= 0)
    /* Maybe snprintf has set errno here? */
    return -1;
  assert(i < remain);
  remain -= i;
  result += i;
  i = (int)self->bh[bi].dlen;
  assert(i <= remain);
  if ((flags & FUZZY_FLAG_ELIMSEQ) != 0)
    i = memcpy_eliminate_sequences(result, self->bh[bi].digest, i);
  else
    memcpy(result, self->bh[bi].digest, (size_t)i);
  result += i;
  remain -= i;
  if (h != 0)
  {
    assert(remain > 0);
    *result = b64[self->bh[bi].h % 64];
    if((flags & FUZZY_FLAG_ELIMSEQ) == 0 || i < 3 ||
       *result != result[-1] ||
       *result != result[-2] ||
       *result != result[-3]) {
      ++result;
      --remain;
    }
  }
  assert(remain > 0);
  *result++ = ':';
  --remain;
  if (bi < self->bhend - 1)
  {
    ++bi;
    i = (int)self->bh[bi].dlen;
    if ((flags & FUZZY_FLAG_NOTRUNC) == 0 &&
	i > SPAMSUM_LENGTH / 2 - 1)
      i = SPAMSUM_LENGTH / 2 - 1;
    assert(i <= remain);
    if ((flags & FUZZY_FLAG_ELIMSEQ) != 0)
      i = memcpy_eliminate_sequences(result,
				     self->bh[bi].digest, i);
    else
      memcpy(result, self->bh[bi].digest, (size_t)i);
    result += i;
    remain -= i;
    if (h != 0) {
      assert(remain > 0);
      h = (flags & FUZZY_FLAG_NOTRUNC) != 0 ? self->bh[bi].h :
	self->bh[bi].halfh;
      *result = b64[h % 64];
      if ((flags & FUZZY_FLAG_ELIMSEQ) == 0 || i < 3 ||
	  *result != result[-1] ||
	  *result != result[-2] ||
	  *result != result[-3])
      {
	++result;
	--remain;
      }
    }
  } else if (h != 0)
    {
      assert(self->bh[bi].dlen == 0);
      assert(remain > 0);
      *result++ = b64[self->bh[bi].h % 64];
      /* No need to bother with FUZZY_FLAG_ELIMSEQ, because this
       * digest has length 1. */
      --remain;
    }
  *result = '\0';
  return 0;
}

void fuzzy_free(/*@only@*/ struct fuzzy_state *self)
{
  free(self);
}

int fuzzy_hash_buf(const unsigned char *buf,
		   uint32_t buf_len,
		   /*@out@*/ char *result)
{
  struct fuzzy_state *ctx;
  int ret = -1;
  if (NULL == (ctx = fuzzy_new()))
    return -1;
  if (fuzzy_update(ctx, buf, buf_len) < 0)
    goto out;
  if (fuzzy_digest(ctx, result, 0) < 0)
    goto out;
  ret = 0;
 out:
  fuzzy_free(ctx);
  return ret;
}

int fuzzy_hash_stream(FILE *handle, /*@out@*/ char *result)
{
  struct fuzzy_state *ctx;
  unsigned char buffer[4096];
  size_t n;
  int ret = -1;
  if (NULL == (ctx = fuzzy_new()))
    return -1;
  for(;;)
  {
    n = fread(buffer, 1, 4096, handle);
    if (0 == n)
      break;
    if (fuzzy_update(ctx, buffer, n) < 0)
      goto out;
  }
  if (ferror(handle) != 0)
    goto out;
  if (fuzzy_digest(ctx, result, 0) < 0)
    goto out;
  ret = 0;
 out:
  fuzzy_free(ctx);
  return ret;
}

#ifdef S_SPLINT_S
typedef size_t off_t;
int fseeko(FILE *, off_t, int);
off_t ftello(FILE *);
#endif

int fuzzy_hash_file(FILE *handle, /*@out@*/ char *result)
{
  off_t fpos;
  int status;
  fpos = ftello(handle);
  if (fseek(handle, 0, SEEK_SET) < 0)
    return -1;
  status = fuzzy_hash_stream(handle, result);
  if (status == 0)
  {
    if (fseeko(handle, fpos, SEEK_SET) < 0)
      return -1;
  }
  return status;
}

int fuzzy_hash_filename(const char *filename, /*@out@*/ char *result)
{
  int status;
  FILE *handle = fopen(filename, "rb");
  if (NULL == handle)
    return -1;
  status = fuzzy_hash_stream(handle, result);
  /* We cannot do anything about an fclose failure. */
  (void)fclose(handle);
  return status;
}

//
// We only accept a match if we have at least one common substring in
// the signature of length ROLLING_WINDOW. This dramatically drops the
// false positive rate for low score thresholds while having
// negligable affect on the rate of spam detection.
//
// return 1 if the two strings do have a common substring, 0 otherwise
//
static int has_common_substring(const char *s1, const char *s2)
{
  int i, j;
  int num_hashes;
  uint32_t hashes[SPAMSUM_LENGTH];

  // there are many possible algorithms for common substring
  // detection. In this case I am re-using the rolling hash code
  // to act as a filter for possible substring matches

  memset(hashes, 0, sizeof(hashes));

  // first compute the windowed rolling hash at each offset in
  // the first string
  struct roll_state state;
  roll_init (&state);

  for (i=0;s1[i];i++)
  {
    roll_hash(&state, (unsigned char)s1[i]);
    hashes[i] = roll_sum(&state);
  }
  num_hashes = i;

  roll_init(&state);

  // now for each offset in the second string compute the
  // rolling hash and compare it to all of the rolling hashes
  // for the first string. If one matches then we have a
  // candidate substring match. We then confirm that match with
  // a direct string comparison */
  for (i=0;s2[i];i++)
  {
    roll_hash(&state, (unsigned char)s2[i]);
    uint32_t h = roll_sum(&state);
    if (i < ROLLING_WINDOW-1) continue;
    for (j=ROLLING_WINDOW-1;j<num_hashes;j++)
    {
      if (hashes[j] != 0 && hashes[j] == h)
      {
	// we have a potential match - confirm it
	if (strlen(s2+i-(ROLLING_WINDOW-1)) >= ROLLING_WINDOW &&
	    strncmp(s2+i-(ROLLING_WINDOW-1),
		    s1+j-(ROLLING_WINDOW-1),
		    ROLLING_WINDOW) == 0)
	{
	  return 1;
	}
      }
    }
  }

  return 0;
}


// eliminate sequences of longer than 3 identical characters. These
// sequences contain very little information so they tend to just bias
// the result unfairly
static char *eliminate_sequences(const char *str)
{
  char *ret;
  size_t i, j, len;

  ret = strdup(str);
  if (!ret)
    return NULL;

  len = strlen(str);
  if (len < 3)
    return ret;

  for (i=j=3 ; i<len ; i++)
  {
    if (str[i] != str[i-1] ||
	str[i] != str[i-2] ||
	str[i] != str[i-3])
    {
      ret[j++] = str[i];
    }
  }

  ret[j] = 0;

  return ret;
}

//
// this is the low level string scoring algorithm. It takes two strings
// and scores them on a scale of 0-100 where 0 is a terrible match and
// 100 is a great match. The block_size is used to cope with very small
// messages.
//
static uint32_t score_strings(const char *s1,
			      const char *s2,
			      unsigned int block_size)
{
  uint32_t score;
  size_t len1, len2;
  int edit_distn(const char *from, int from_len, const char *to, int to_len);

  len1 = strlen(s1);
  len2 = strlen(s2);

  if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) {
    // not a real spamsum signature?
    return 0;
  }

  // the two strings must have a common substring of length
  // ROLLING_WINDOW to be candidates
  if (has_common_substring(s1, s2) == 0) {
    return 0;
  }

  // compute the edit distance between the two strings. The edit distance gives
  // us a pretty good idea of how closely related the two strings are
  score = edit_distn(s1, len1, s2, len2);

  // scale the edit distance by the lengths of the two
  // strings. This changes the score to be a measure of the
  // proportion of the message that has changed rather than an
  // absolute quantity. It also copes with the variability of
  // the string lengths.
  score = (score * SPAMSUM_LENGTH) / (len1 + len2);

  // at this stage the score occurs roughly on a 0-64 scale,
  // with 0 being a good match and 64 being a complete
  // mismatch

  // rescale to a 0-100 scale (friendlier to humans)
  score = (100 * score) / 64;

  // it is possible to get a score above 100 here, but it is a
  // really terrible match
  if (score >= 100)
    return 0;

  // now re-scale on a 0-100 scale with 0 being a poor match and
  // 100 being a excellent match.
  score = 100 - score;

  //  printf ("len1: %"PRIu32"  len2: %"PRIu32"\n", len1, len2);

  // when the blocksize is small we don't want to exaggerate the match size
  if (score > block_size/MIN_BLOCKSIZE * MIN(len1, len2))
  {
    score = block_size/MIN_BLOCKSIZE * MIN(len1, len2);
  }
  return score;
}

//
// Given two spamsum strings return a value indicating the degree
// to which they match.
//
int fuzzy_compare(const char *str1, const char *str2)
{
  unsigned int block_size1, block_size2;
  uint32_t score = 0;
  char *s1, *s2;
  char *s1_1, *s1_2, *s1_3;
  char *s2_1, *s2_2, *s2_3;

  if (NULL == str1 || NULL == str2)
    return -1;

  // each spamsum is prefixed by its block size
  if (sscanf(str1, "%u:", &block_size1) != 1 ||
      sscanf(str2, "%u:", &block_size2) != 1) {
    return -1;
  }

  // if the blocksizes don't match then we are comparing
  // apples to oranges. This isn't an 'error' per se. We could
  // have two valid signatures, but they can't be compared.
  if (block_size1 != block_size2 &&
      block_size1 != block_size2*2 &&
      block_size2 != block_size1*2) {
    return 0;
  }

  // move past the prefix
  str1 = strchr(str1, ':');
  str2 = strchr(str2, ':');

  if (!str1 || !str2) {
    // badly formed ...
    return -1;
  }

  // there is very little information content is sequences of
  // the same character like 'LLLLL'. Eliminate any sequences
  // longer than 3. This is especially important when combined
  // with the has_common_substring() test below.
  // NOTE: This function duplciates str1 and str2
  s1 = eliminate_sequences(str1+1);
  if (!s1)
    return 0;
  s2 = eliminate_sequences(str2+1);
  if (!s2)
  {
    free(s1);
    return 0;
  }

  // now break them into the two pieces
  s1_1 = s1;
  s2_1 = s2;

  s1_2 = strchr(s1, ':');
  s2_2 = strchr(s2, ':');

  if (!s1_2 || !s2_2) {
    // a signature is malformed - it doesn't have 2 parts
    free(s1); free(s2);
    return -1;
  }

  // Chop the first substring. We terminate the first substring
  // and then advance the pointer to the start of the second substring.
  *s1_2 = 0;
  s1_2++;
  *s2_2 = 0;
  s2_2++;

  // Chop the second string at the comma--just before the filename.
  // If the strings don't have a comma (i.e. don't have a filename)
  // that's ok. It's not an error. This function can be called on
  // signatures which don't have filenames attached.
  // We also don't have to advance past the comma however. We don't care
  // about the filename
  s1_3 = strchr(s1_2, ',');
  s2_3 = strchr(s2_2, ',');
  if (s1_3 != NULL)
    *s1_3 = 0;
  if (s2_3 != NULL)
    *s2_3 = 0;

  // each signature has a string for two block sizes. We now
  // choose how to combine the two block sizes. We checked above
  // that they have at least one block size in common
  if (block_size1 == block_size2)
  {
    uint32_t score1, score2;
    score1 = score_strings(s1_1, s2_1, block_size1);
    score2 = score_strings(s1_2, s2_2, block_size1*2);
    score = MAX(score1, score2);
  }
  else if (block_size1 == block_size2*2)
  {
    score = score_strings(s1_1, s2_2, block_size1);
  }
  else
  {
    score = score_strings(s1_2, s2_1, block_size2);
  }

  free(s1);
  free(s2);

  return (int)score;
}