Unverified Commit 65378d4d by Wesley Shields Committed by GitHub

Consolidate PRs into single branch (#219)

* Support xor_value in returned strings.

Extend the tuple that represents an instance of a match to include the xor key.
This breaks all existing scripts that are unpacking the tuple, which I'm not
very happy with.

This also updates the submodule to use the latest master so that I can get the
new xor key values.

Also, adds a fix to get yara building here by defining BUCKETS_128 and
CHECKSUM_1B as needed by the new tlsh stuff (discussed with @metthal).

* Add two new objects to yara-python.

Add a StringMatch object, which represents a matched string. It has an
identifier member (this is the string identifier, eg: $a) and an instances
member which contains a list of matched string instances.

It also keeps track of the string flags internally but does not expose them
directly as the string flags contain things that are internal to YARA (eg:
STRING_FLAGS_FITS_IN_ATOM). The reason it keeps track of the string modifiers
is so that it can be extended to allow users to take action based upon certain
flags. For example, there is an "is_xor()" method on StringMatch which will
return True if the string is using the xor modifier. This way users can call
another method (discussed below) to get the plaintext string back.

Add a StringMatchInstance object which represents an instance of a matched
string. It contains the offset, matched data and the xor key used to match the
string (this is ALWAYS set, even to 0 if the string is not an xor string).

There is a "plaintext()" method on the StringMatchInstance objects which will
return a new bytes object with the xor key applied. This allows users to do
something like this:

```
print(instance.plaintext() if string.is_xor() else instance.matched_data)
```

Technically, the plaintext() method will return the matched_data if the xor_key
is 0 so they don't need to do the conditional but this allows them a nice way to
know if the xor_key is worth recording along with the plaintext.

I decided not to implement richcompare for these new objects as it isn't
entirely clear what I would want to do the comparison on.

* Add "matched_length" member.

Add a "matched_length" member to match instances. This is useful when the
"matched_data" member is a subset of the actually matched data.

Add a test for this that sets the max_match_data config to 2 and then checks to
make sure the "matched_length" and "matched_data" members are correct.

* Add modules list to yara object.

Add support for getting the list of available modules. It is available just by
accessing the yara.modules attribute, which contains a list of available
modules.

>>> print('\n'.join(yara.modules))
tests
pe
elf
math
time
console
>>>

Note: This commit also brings in the necessary defines to build the authenticode
parser, which is also done in the xor_value branch. Also, this commit updates
the yara submodule, which will likely overwrite the changes made in the
xor_value branch, so I recommend updating the submodule after both are merged.

* Update yara to 65feab41d4cbf4a75338561d8506fc1fa9fa6ba6.

* Fix test using \t in a regex.

* Fix build on Windows in appveyor.

* Actually fix appveyor builds on windows?
parent 42ccdd39
......@@ -155,7 +155,7 @@ build_script:
- "%CMD_IN_ENV% python setup.py build_ext --enable-cuckoo --enable-dotnet
-L../jansson-%JANSSON_VERSION%/build/lib/Release;../openssl/lib
-I../jansson-%JANSSON_VERSION%/build/include;../openssl/include
-DHASH_MODULE,HAVE_LIBCRYPTO
-DHASH_MODULE,HAVE_LIBCRYPTO,BUCKETS_128,CHECKSUM_1B
-llibcrypto"
after_build:
......
......@@ -188,6 +188,12 @@ class BuildExtCommand(build_ext):
exclusions = []
# Needed to build tlsh
module.define_macros.extend([('BUCKETS_128', 1), ('CHECKSUM_1B', 1)])
# Needed to build authenticode parser
module.libraries.append('ssl')
for define in self.define or []:
module.define_macros.append(define)
......@@ -371,4 +377,5 @@ setup(
ext_modules=[Extension(
name='yara',
include_dirs=['yara/libyara/include', 'yara/libyara/', '.'],
define_macros=[('BUCKETS_128', 1), ('CHECKSUM_1B', 1)],
sources=['yara-python.c'])])
......@@ -306,11 +306,12 @@ class TestYara(unittest.TestCase):
matches = rule.match(data=string)
if expected_result == SUCCEED:
self.assertTrue(matches)
_, _, matching_string = matches[0].strings[0]
matching_string = matches[0].strings[0]
instance = matching_string.instances[0]
if sys.version_info[0] >= 3:
self.assertTrue(matching_string == bytes(test[3], 'utf-8'))
self.assertTrue(instance.matched_data == bytes(test[3], 'utf-8'))
else:
self.assertTrue(matching_string == test[3])
self.assertTrue(instance.matched_data == test[3])
else:
self.assertFalse(matches)
......@@ -559,9 +560,13 @@ class TestYara(unittest.TestCase):
matches = rules.match(data='abbb')
if sys.version_info[0] >= 3:
self.assertTrue(matches[0].strings == [(0, '$a', bytes('ab', 'utf-8'))])
self.assertTrue(matches[0].strings[0].identifier == '$a')
self.assertTrue(matches[0].strings[0].instances[0].offset == 0)
self.assertTrue(matches[0].strings[0].instances[0].matched_data == bytes('ab', 'utf-8'))
else:
self.assertTrue(matches[0].strings == [(0, '$a', 'ab')])
self.assertTrue(matches[0].strings[0].identifier == '$a')
self.assertTrue(matches[0].strings[0].instances[0].offset == 0)
self.assertTrue(matches[0].strings[0].instances[0].matched_data == 'ab')
def testCount(self):
......@@ -650,6 +655,58 @@ class TestYara(unittest.TestCase):
'rule test { strings: $a = "ssi" condition: for all i in (1..#a) : (@a[i] == 5) }',
], 'mississipi')
def testXorKey(self):
    # The callback stores whatever it receives in the module-level
    # rule_data so the assertions below can inspect it after the scan.
    global rule_data
    rule_data = None

    def capture(data):
        global rule_data
        rule_data = data
        return yara.CALLBACK_CONTINUE

    rules = yara.compile(
        source='rule test { strings: $a = "dummy" xor(1-2) condition: $a }')
    # 'etllx' is "dummy" xored with 1 and 'fwoo{' is "dummy" xored with 2,
    # so the single string $a is expected to match twice.
    rules.match(data='etllxfwoo{', callback=capture)

    self.assertTrue(rule_data['matches'])
    self.assertEqual(rule_data['rule'], 'test')

    matched_strings = rule_data['strings']
    self.assertEqual(len(matched_strings), 1)

    instances = matched_strings[0].instances
    self.assertEqual(len(instances), 2)
    for instance, expected_key in zip(instances, (1, 2)):
        self.assertEqual(instance.xor_key, expected_key)

    # Make sure plaintext() works.
    self.assertTrue(instances[0].plaintext() == b'dummy')
# Test that the xor_key for matched strings is 0 if the string is not an xor
# string. We always want to make sure this is set!
def testXorKeyNoXorString(self):
    # The callback stores whatever it receives in the module-level
    # rule_data so the assertions below can inspect it after the scan.
    global rule_data
    rule_data = None

    def capture(data):
        global rule_data
        rule_data = data
        return yara.CALLBACK_CONTINUE

    rules = yara.compile(
        source='rule test { strings: $a = "dummy" condition: $a }')
    rules.match(data='dummy', callback=capture)

    self.assertTrue(rule_data['matches'])
    self.assertEqual(rule_data['rule'],'test')

    matched_strings = rule_data['strings']
    self.assertEqual(len(matched_strings), 1)

    # A string without the xor modifier must still report a key, and it
    # must be 0.
    first_instance = matched_strings[0].instances[0]
    self.assertEqual(first_instance.xor_key, 0)
def testMatchedLength(self):
    # Limit the captured match data to 2 bytes so that matched_data is a
    # truncated prefix while matched_length still reports the full match.
    yara.set_config(max_match_data=2)
    try:
        r = yara.compile(source='rule test { strings: $a = "dummy" condition: $a }')
        matches = r.match(data='dummy')
        instance = matches[0].strings[0].instances[0]
        self.assertEqual(instance.matched_length, 5)
        self.assertEqual(instance.matched_data, b'du')
    finally:
        # The limit is changed through the module-level yara.set_config, so
        # put it back even when an assertion above fails; otherwise later
        # tests would run with matched data truncated to 2 bytes.
        yara.set_config(max_match_data=512)
def testRE(self):
self.assertTrueRules([
......@@ -661,8 +718,8 @@ class TestYara(unittest.TestCase):
'rule test { strings: $a = /(M|N)iss/ nocase condition: $a }',
'rule test { strings: $a = /[M-N]iss/ nocase condition: $a }',
'rule test { strings: $a = /(Mi|ssi)ssippi/ nocase condition: $a }',
'rule test { strings: $a = /ppi\tmi/ condition: $a }',
r'rule test { strings: $a = /ppi\.mi/ condition: $a }',
r'rule test { strings: $a = /ppi\tmi/ condition: $a }',
'rule test { strings: $a = /ppi\.mi/ condition: $a }',
'rule test { strings: $a = /^mississippi/ fullword condition: $a }',
'rule test { strings: $a = /mississippi.*mississippi$/s condition: $a }',
], 'mississippi\tmississippi.mississippi\nmississippi')
......
yara @ 65feab41
Subproject commit b77e4f45b4662af320c999d4ee559e1f3bc61226
Subproject commit 65feab41d4cbf4a75338561d8506fc1fa9fa6ba6
......@@ -205,6 +205,231 @@ static PyTypeObject Match_Type = {
0, /* tp_new */
};
// StringMatch object
//
// Instance layout for the Python type registered as "yara.StringMatch".
// Objects of this type are created by StringMatch_NEW.
typedef struct
{
PyObject_HEAD
// String identifier (eg: $a); exposed read-only as the "identifier" member.
PyObject* identifier;
// Object passed to StringMatch_NEW as instance_list; exposed read-only as
// the "instances" member.
PyObject* instances;
// This is not exposed directly because it contains flags that are internal
// to yara (eg: STRING_FLAGS_FITS_IN_ATOM) along with modifiers
// (eg: STRING_FLAGS_XOR).
uint64_t flags;
} StringMatch;
// Read-only attributes of StringMatch that are visible from Python. The
// "flags" field of the struct is deliberately absent from this table.
static PyMemberDef StringMatch_members[] = {
{
"identifier",
T_OBJECT_EX,
offsetof(StringMatch, identifier),
READONLY,
"Name of the matching string"
},
{
"instances",
T_OBJECT_EX,
offsetof(StringMatch, instances),
READONLY,
"StringMatchInstance objects of the matching string"
},
{ NULL } // End marker
};
static PyObject* StringMatch_NEW(
const char* identifier,
uint64_t flags,
PyObject* instance_list);
static void StringMatch_dealloc(
PyObject* self);
static PyObject* StringMatch_repr(
PyObject* self);
static PyObject* StringMatch_getattro(
PyObject* self,
PyObject* name);
static Py_hash_t StringMatch_hash(
PyObject* self);
static PyObject* StringMatch_is_xor(
PyObject* self,
PyObject* args);
// Methods of StringMatch that are callable from Python. is_xor() takes no
// arguments (METH_NOARGS) and is implemented by StringMatch_is_xor.
static PyMethodDef StringMatch_methods[] =
{
{
"is_xor",
(PyCFunction) StringMatch_is_xor,
METH_NOARGS,
"Return true if a string has the xor modifier"
},
// End marker
{ NULL },
};
// Type object for "yara.StringMatch". Only the dealloc, repr, hash and
// getattro slots plus the method and member tables are filled in; every
// other slot is left as 0.
static PyTypeObject StringMatch_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
"yara.StringMatch", /*tp_name*/
sizeof(StringMatch), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)StringMatch_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
StringMatch_repr, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
StringMatch_hash, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
StringMatch_getattro, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
"StringMatch class", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */ // XXX: Implement richcompare?
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
StringMatch_methods, /* tp_methods */
StringMatch_members, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
0, /* tp_new */
};
// StringMatchInstance object
//
// Instance layout for the Python type registered as
// "yara.StringMatchInstance". Objects of this type are created by
// StringMatchInstance_NEW; every field is exposed read-only under the same
// name.
typedef struct
{
PyObject_HEAD
// Python int built from the offset passed to StringMatchInstance_NEW.
PyObject* offset;
// Object passed to StringMatchInstance_NEW as matched_data.
PyObject* matched_data;
// Python int built from the match_length passed to StringMatchInstance_NEW.
PyObject* matched_length;
// Python int built from the xor_key passed to StringMatchInstance_NEW.
PyObject* xor_key;
} StringMatchInstance;
// Read-only attributes of StringMatchInstance that are visible from Python.
// Each entry maps directly onto the struct field of the same name.
static PyMemberDef StringMatchInstance_members[] = {
{
"offset",
T_OBJECT_EX,
offsetof(StringMatchInstance, offset),
READONLY,
"Offset of the matched data"
},
{
"matched_data",
T_OBJECT_EX,
offsetof(StringMatchInstance, matched_data),
READONLY,
"Matched data"
},
{
"matched_length",
T_OBJECT_EX,
offsetof(StringMatchInstance, matched_length),
READONLY,
"Length of matched data"
},
{
"xor_key",
T_OBJECT_EX,
offsetof(StringMatchInstance, xor_key),
READONLY,
"XOR key found for xor strings"
},
{ NULL } // End marker
};
static PyObject* StringMatchInstance_NEW(
uint64_t offset,
PyObject* matched_data,
int32_t match_length,
uint8_t xor_key);
static void StringMatchInstance_dealloc(
PyObject* self);
static PyObject* StringMatchInstance_repr(
PyObject* self);
static PyObject* StringMatchInstance_getattro(
PyObject* self,
PyObject* name);
static Py_hash_t StringMatchInstance_hash(
PyObject* self);
static PyObject* StringMatchInstance_plaintext(
PyObject* self,
PyObject* args);
// Methods of StringMatchInstance that are callable from Python. plaintext()
// takes no arguments (METH_NOARGS) and is implemented by
// StringMatchInstance_plaintext.
static PyMethodDef StringMatchInstance_methods[] =
{
{
"plaintext",
(PyCFunction) StringMatchInstance_plaintext,
METH_NOARGS,
"Return matched data after xor key applied."
},
// End marker
{ NULL },
};
// Type object for "yara.StringMatchInstance". Only the dealloc, repr, hash
// and getattro slots plus the method and member tables are filled in; every
// other slot is left as 0.
static PyTypeObject StringMatchInstance_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
"yara.StringMatchInstance", /*tp_name*/
sizeof(StringMatchInstance), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)StringMatchInstance_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
StringMatchInstance_repr, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
StringMatchInstance_hash, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
StringMatchInstance_getattro, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
"StringMatchInstance class", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */ // XXX: Implement richcompare?
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
StringMatchInstance_methods, /* tp_methods */
StringMatchInstance_members, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
0, /* tp_new */
};
// Rule object
typedef struct
......@@ -900,12 +1125,13 @@ int yara_callback(
const char* tag;
PyObject* tag_list = NULL;
PyObject* string_instance_list = NULL;
PyObject* string_list = NULL;
PyObject* meta_list = NULL;
PyObject* string_match_instance = NULL;
PyObject* match;
PyObject* callback_dict;
PyObject* object;
PyObject* tuple;
PyObject* matches = ((CALLBACK_DATA*) user_data)->matches;
PyObject* callback = ((CALLBACK_DATA*) user_data)->callback;
PyObject* callback_result;
......@@ -1009,21 +1235,45 @@ int yara_callback(
yr_rule_strings_foreach(rule, string)
{
// If this string is not a match, skip it. We have to check for this here
// and not rely on it in yr_string_matches_foreach macro because we need
// to create the string match instance list before we make the items that
// go in it.
if (context->matches[string->idx].head == NULL)
continue;
string_instance_list = PyList_New(0);
if (string_instance_list == NULL)
return PyErr_Format(PyExc_TypeError, "Out of memory");
yr_string_matches_foreach(context, string, m)
{
object = PyBytes_FromStringAndSize((char*) m->data, m->data_length);
tuple = Py_BuildValue(
"(L,s,O)",
string_match_instance = StringMatchInstance_NEW(
m->base + m->offset,
string->identifier,
object);
object,
m->match_length,
m->xor_key);
if (string_match_instance == NULL)
return PyErr_Format(PyExc_TypeError, "Out of memory");
PyList_Append(string_list, tuple);
PyList_Append(string_instance_list, string_match_instance);
Py_DECREF(object);
Py_DECREF(tuple);
Py_DECREF(string_match_instance);
}
object = StringMatch_NEW(
string->identifier,
string->flags,
string_instance_list);
if (object == NULL)
return PyErr_Format(PyExc_TypeError, "Out of memory");
Py_DECREF(string_instance_list);
PyList_Append(string_list, object);
Py_DECREF(object);
}
if (message == CALLBACK_MSG_RULE_MATCHING)
......@@ -1514,6 +1764,176 @@ static Py_hash_t Match_hash(
return PyObject_Hash(match->rule) + PyObject_Hash(match->ns);
}
////////////////////////////////////////////////////////////////////////////////
// Create a new StringMatch object.
//
// Args:
//   identifier: C string with the string identifier (eg: "$a").
//   flags: YARA string flags; kept internally and queried by is_xor().
//   instance_list: object stored as the "instances" member. A new reference
//     is taken, so the caller keeps ownership of its own reference.
//
// Returns:
//   A new reference to the StringMatch object, or NULL with a Python
//   exception set if an allocation fails.
static PyObject* StringMatch_NEW(
    const char* identifier,
    uint64_t flags,
    PyObject* instance_list)
{
  StringMatch* object;

  // Build the identifier before the object itself so that a failure never
  // leaves behind a StringMatch whose identifier is NULL: the dealloc, repr
  // and hash functions all dereference it unconditionally.
  PyObject* py_identifier = PY_STRING(identifier);

  if (py_identifier == NULL)
    return NULL;

  object = PyObject_NEW(StringMatch, &StringMatch_Type);

  if (object == NULL)
  {
    Py_DECREF(py_identifier);
    return NULL;
  }

  object->identifier = py_identifier;
  object->flags = flags;
  object->instances = instance_list;
  Py_INCREF(instance_list);

  return (PyObject*) object;
}
// Destructor for StringMatch: drops the references held on the identifier
// and on the instances object, then frees the object's own memory.
static void StringMatch_dealloc(
    PyObject* self)
{
  StringMatch* string_match = (StringMatch*) self;

  Py_DECREF(string_match->identifier);
  Py_DECREF(string_match->instances);

  PyObject_Del(self);
}
// repr() of a StringMatch is its identifier object. Returns a new reference
// to that object.
static PyObject* StringMatch_repr(
    PyObject* self)
{
  PyObject* identifier = ((StringMatch*) self)->identifier;

  Py_INCREF(identifier);
  return identifier;
}
// Attribute lookup for StringMatch: delegates to the generic lookup.
static PyObject* StringMatch_getattro(
    PyObject* self,
    PyObject* name)
{
  PyObject* attribute = PyObject_GenericGetAttr(self, name);

  return attribute;
}
// The hash of a StringMatch is the hash of its identifier alone. Be careful
// when relying on it: identifiers can be duplicated between rules and there
// are anonymous strings too, so distinct matches can hash identically.
static Py_hash_t StringMatch_hash(
    PyObject* self)
{
  StringMatch* string_match = (StringMatch*) self;

  return PyObject_Hash(string_match->identifier);
}
// Python method is_xor(): returns True when STRING_FLAGS_XOR is set in the
// stored string flags and False otherwise. The args parameter is unused
// (the method is registered as METH_NOARGS).
static PyObject* StringMatch_is_xor(
    PyObject* self,
    PyObject* args)
{
  StringMatch* string_match = (StringMatch*) self;

  if ((string_match->flags & STRING_FLAGS_XOR) == 0)
  {
    Py_RETURN_FALSE;
  }

  Py_RETURN_TRUE;
}
////////////////////////////////////////////////////////////////////////////////
// Create a new StringMatchInstance object.
//
// Args:
//   offset: offset of the match; stored as a Python int.
//   matched_data: object stored as the "matched_data" member. A new reference
//     is taken, so the caller keeps ownership of its own reference.
//   match_length: length of the match; stored as a Python int.
//   xor_key: xor key of the match; stored as a Python int.
//
// Returns:
//   A new reference to the StringMatchInstance object, or NULL with a Python
//   exception set if an allocation fails.
static PyObject* StringMatchInstance_NEW(
    uint64_t offset,
    PyObject* matched_data,
    int32_t match_length,
    uint8_t xor_key)
{
  StringMatchInstance* object = NULL;

  // Build every member before the object itself so that a failure never
  // leaves behind an instance with NULL members: the dealloc function
  // releases them unconditionally.
  PyObject* py_offset = PyLong_FromLongLong(offset);
  PyObject* py_length = PyLong_FromLong(match_length);
  PyObject* py_xor_key = PyLong_FromUnsignedLong((uint32_t) xor_key);

  if (py_offset != NULL && py_length != NULL && py_xor_key != NULL)
    object = PyObject_NEW(StringMatchInstance, &StringMatchInstance_Type);

  if (object == NULL)
  {
    Py_XDECREF(py_offset);
    Py_XDECREF(py_length);
    Py_XDECREF(py_xor_key);
    return NULL;
  }

  object->offset = py_offset;
  object->matched_data = matched_data;
  object->matched_length = py_length;
  object->xor_key = py_xor_key;
  Py_INCREF(matched_data);

  return (PyObject*) object;
}
// Destructor for StringMatchInstance: drops the reference held on every
// member, then frees the object's own memory.
static void StringMatchInstance_dealloc(
    PyObject* self)
{
  StringMatchInstance* object = (StringMatchInstance*) self;

  Py_DECREF(object->offset);
  Py_DECREF(object->matched_data);
  // matched_length is created in StringMatchInstance_NEW just like offset
  // and xor_key, so it has to be released here as well or it leaks.
  Py_DECREF(object->matched_length);
  Py_DECREF(object->xor_key);

  PyObject_Del(self);
}
// repr() of a StringMatchInstance is its matched data decoded as UTF-8 with
// the "backslashreplace" error handler.
static PyObject* StringMatchInstance_repr(
    PyObject* self)
{
  PyObject* matched_data = ((StringMatchInstance*) self)->matched_data;

  return PyCodec_Decode(matched_data, "utf-8", "backslashreplace");
}
// Attribute lookup for StringMatchInstance: delegates to the generic lookup.
static PyObject* StringMatchInstance_getattro(
    PyObject* self,
    PyObject* name)
{
  PyObject* attribute = PyObject_GenericGetAttr(self, name);

  return attribute;
}
// The hash of a StringMatchInstance is the hash of its matched data alone;
// offset, matched length and xor key do not take part in it.
static Py_hash_t StringMatchInstance_hash(
    PyObject* self)
{
  StringMatchInstance* instance = (StringMatchInstance*) self;

  return PyObject_Hash(instance->matched_data);
}
// Python method plaintext(): return the matched data with the xor key
// applied to every byte.
//
// Args:
//   self: the StringMatchInstance.
//   args: unused (the method is registered as METH_NOARGS).
//
// Returns:
//   A new reference. When the xor key is 0 this is the matched_data object
//   itself; otherwise it is a freshly created bytes object. NULL is returned
//   with a Python exception set on failure.
static PyObject* StringMatchInstance_plaintext(
    PyObject* self,
    PyObject* args)
{
  char* pb;
  uint8_t* out;
  uint8_t key;
  Py_ssize_t length;
  Py_ssize_t i;
  PyObject* result;

  StringMatchInstance* instance = (StringMatchInstance*) self;

  uint64_t xor_key = PyLong_AsUnsignedLongLong(instance->xor_key);

  // (unsigned long long) -1 is the error return, but only if an exception is
  // actually pending.
  if (xor_key == (uint64_t) -1 && PyErr_Occurred())
    return NULL;

  if (xor_key == 0)
  {
    // The caller receives ownership of the returned object, so hand out a
    // new reference instead of the one owned by the instance.
    Py_INCREF(instance->matched_data);
    return instance->matched_data;
  }

  if (PyBytes_AsStringAndSize(instance->matched_data, &pb, &length) == -1)
    return NULL;

  // pb points to the internal buffer of the bytes object, which must not be
  // modified. Create an uninitialised bytes object of the same size and write
  // the xored bytes straight into it; it is not visible to any other code
  // until it is returned, so filling it in place is safe.
  result = PyBytes_FromStringAndSize(NULL, length);

  if (result == NULL)
    return NULL;

  out = (uint8_t*) PyBytes_AS_STRING(result);
  key = (uint8_t) xor_key;

  for (i = 0; i < length; i++)
    out[i] = ((uint8_t) pb[i]) ^ key;

  return result;
}
////////////////////////////////////////////////////////////////////////////////
......@@ -2745,11 +3165,19 @@ MOD_INIT(yara)
if (PyType_Ready(&Match_Type) < 0)
return MOD_ERROR_VAL;
if (PyType_Ready(&StringMatch_Type) < 0)
return MOD_ERROR_VAL;
if (PyType_Ready(&StringMatchInstance_Type) < 0)
return MOD_ERROR_VAL;
PyStructSequence_InitType(&RuleString_Type, &RuleString_Desc);
PyModule_AddObject(m, "Rule", (PyObject*) &Rule_Type);
PyModule_AddObject(m, "Rules", (PyObject*) &Rules_Type);
PyModule_AddObject(m, "Match", (PyObject*) &Match_Type);
PyModule_AddObject(m, "StringMatch", (PyObject*) &StringMatch_Type);
PyModule_AddObject(m, "StringMatchInstance", (PyObject*) &StringMatchInstance_Type);
PyModule_AddObject(m, "Error", YaraError);
PyModule_AddObject(m, "SyntaxError", YaraSyntaxError);
......@@ -2762,6 +3190,29 @@ MOD_INIT(yara)
return MOD_ERROR_VAL;
}
PyObject* module_names_list = PyList_New(0);
if (module_names_list == NULL)
{
PyErr_SetString(YaraError, "module list error");
return MOD_ERROR_VAL;
}
for (YR_MODULE* module = yr_modules_get_table(); module->name != NULL; module++)
{
PyObject* module_name = PY_STRING(module->name);
if (module_name == NULL)
{
PyErr_SetString(YaraError, "module name error");
return MOD_ERROR_VAL;
}
if (PyList_Append(module_names_list, module_name) < 0)
{
PyErr_SetString(YaraError, "module name error");
return MOD_ERROR_VAL;
}
}
PyModule_AddObject(m, "modules", module_names_list);
Py_AtExit(finalize);
return MOD_SUCCESS_VAL(m);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment