Unverified Commit 9a0ae7a3 by Enkelmann Committed by GitHub

IR refactoring (#205)

parent 3a25050e
use super::{Def, Jmp};
use crate::prelude::*;
use crate::utils::log::LogMessage;
use std::collections::HashSet;
/// A basic block is a sequence of `Def` instructions followed by up to two `Jmp` instructions.
///
/// The `Def` instructions represent side-effectful operations that are executed in order when the block is entered.
/// `Def` instructions do not affect the control flow of a program.
///
/// The `Jmp` instructions represent control flow affecting operations.
/// There can only be zero, one or two `Jmp`s:
/// - Zero `Jmp`s indicate that the next instruction to be executed could not be discerned.
/// This should only happen on disassembler errors or on dead ends in the control flow graph that were deliberately inserted by the user.
/// - If there is exactly one `Jmp`, it is required to be an unconditional jump.
/// - For two jumps, the first one has to be a conditional jump,
/// where the second unconditional jump is only taken if the condition of the first jump evaluates to false.
///
/// If one of the `Jmp` instructions is an indirect jump,
/// then the `indirect_jmp_targets` is a list of possible jump target addresses for that jump.
/// The list may not be complete and the entries are not guaranteed to be correct.
///
/// Basic blocks are *single entry, single exit*, i.e. a basic block is only entered at the beginning
/// and is only exited by the jump instructions at the end of the block.
/// If a new control flow edge is discovered that would jump to the middle of a basic block,
/// the block structure needs to be updated accordingly.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Blk {
/// The `Def` instructions of the basic block in order of execution.
pub defs: Vec<Term<Def>>,
/// The `Jmp` instructions of the basic block.
/// See the type-level documentation for the allowed number and order of jumps.
pub jmps: Vec<Term<Jmp>>,
/// If the basic block contains an indirect jump,
/// this field contains possible jump target addresses for the jump.
///
/// Note that possible targets of indirect calls are *not* contained,
/// since the [`Project::make_block_to_sub_mapping_unique`] normalization pass assumes
/// that only intraprocedural jump targets are contained in this field.
pub indirect_jmp_targets: Vec<Tid>,
}
impl Term<Blk> {
/// Remove indirect jump target addresses for which no corresponding target block exists.
///
/// A target is kept if and only if it is contained in `known_block_tids`.
/// The relative order of the remaining targets is preserved.
///
/// Returns `Ok(())` if no target had to be removed.
/// Otherwise returns one error message per removed target (in the order of removal),
/// each located at the term identifier of this block.
pub fn remove_nonexisting_indirect_jump_targets(
    &mut self,
    known_block_tids: &HashSet<Tid>,
) -> Result<(), Vec<LogMessage>> {
    let mut logs = Vec::new();
    // Borrow the block TID on its own, so that the closure below does not have to borrow `self` as a whole
    // while the target list is mutably borrowed by `retain`.
    let block_tid = &self.tid;
    // Filter the list in place instead of cloning every remaining target into a new vector.
    self.term.indirect_jmp_targets.retain(|target| {
        let target_exists = known_block_tids.contains(target);
        if !target_exists {
            let error_msg = format!("Indirect jump target at {} does not exist", target.address);
            logs.push(LogMessage::new_error(error_msg).location(block_tid.clone()));
        }
        target_exists
    });
    if logs.is_empty() {
        Ok(())
    } else {
        Err(logs)
    }
}
/// Inline the defining expressions of variables into later expressions of the block wherever this is possible.
///
/// A defining expression can only be inlined
/// as long as neither the defined variable nor any input variable of the defining expression
/// has been overwritten since the definition.
///
/// The expression propagation allows the [`Project::substitute_trivial_expressions`] normalization pass
/// to further simplify the generated expressions
/// and allows more dead stores to be removed during [dead variable elimination](`crate::analysis::dead_variable_elimination`).
pub fn propagate_input_expressions(&mut self) {
    // Pairs `(variable, defining expression)` that are still valid at the current position in the block.
    let mut known_definitions = Vec::new();
    for def in self.term.defs.iter_mut() {
        match &mut def.term {
            Def::Assign { var, value } => {
                for (known_var, known_expr) in known_definitions.iter() {
                    value.substitute_input_var(known_var, known_expr);
                }
                // The assignment overwrites `var`:
                // Forget the old definition of `var` and all definitions that read `var`.
                known_definitions.retain(|(known_var, known_expr)| {
                    !(known_var == var
                        || known_expr.input_vars().into_iter().any(|input| input == var))
                });
                // The new definition can only be inlined later on
                // if it does not refer to the former value of `var`.
                let reads_own_target = value.input_vars().into_iter().any(|input| input == var);
                if !reads_own_target {
                    known_definitions.push((var.clone(), value.clone()));
                }
            }
            Def::Load { var, address } => {
                for (known_var, known_expr) in known_definitions.iter() {
                    address.substitute_input_var(known_var, known_expr);
                }
                // The load overwrites `var`, but no defining expression is recorded for the loaded value.
                known_definitions.retain(|(known_var, known_expr)| {
                    !(known_var == var
                        || known_expr.input_vars().into_iter().any(|input| input == var))
                });
            }
            Def::Store { address, value } => {
                // A store does not overwrite a variable, so the known definitions stay untouched.
                for (known_var, known_expr) in known_definitions.iter() {
                    address.substitute_input_var(known_var, known_expr);
                    value.substitute_input_var(known_var, known_expr);
                }
            }
        }
    }
    for jump in self.term.jmps.iter_mut() {
        // Select the expression of the jump (if it has one).
        let jump_expr = match &mut jump.term {
            Jmp::BranchInd(expr) | Jmp::Return(expr) => expr,
            Jmp::CBranch { condition, .. } => condition,
            Jmp::CallInd { target, .. } => target,
            Jmp::Branch(_) | Jmp::Call { .. } | Jmp::CallOther { .. } => continue,
        };
        for (known_var, known_expr) in known_definitions.iter() {
            jump_expr.substitute_input_var(known_var, known_expr);
        }
    }
}
/// Merge subsequent assignments to the same variable to a single assignment to that variable.
///
/// The value expressions of merged assignments can often be simplified later on
/// in the [`Project::substitute_trivial_expressions`] normalization pass.
pub fn merge_def_assignments_to_same_var(&mut self) {
let mut new_defs = Vec::new();
let mut last_def_opt = None;
for def in self.term.defs.iter() {
if let Def::Assign {
var: current_var, ..
} = &def.term
{
if let Some(Term {
term:
Def::Assign {
var: last_var,
value: last_value,
},
..
}) = &last_def_opt
{
if current_var == last_var {
let mut substituted_def = def.clone();
substituted_def.substitute_input_var(last_var, last_value);
last_def_opt = Some(substituted_def);
} else {
new_defs.push(last_def_opt.unwrap());
last_def_opt = Some(def.clone());
}
} else if last_def_opt.is_some() {
panic!(); // Only assign-defs should be saved in last_def.
} else {
last_def_opt = Some(def.clone());
}
} else {
if let Some(last_def) = last_def_opt {
new_defs.push(last_def);
}
new_defs.push(def.clone());
last_def_opt = None;
}
}
if let Some(last_def) = last_def_opt {
new_defs.push(last_def);
}
self.term.defs = new_defs;
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::intermediate_representation::{Def, Expression, Variable};
impl Blk {
    /// Create an empty mock block (no defs, no jumps, no indirect jump targets)
    /// with the term identifier `"block"`.
    pub fn mock() -> Term<Blk> {
        Self::mock_with_tid("block")
    }
    /// Create an empty mock block (no defs, no jumps, no indirect jump targets)
    /// with the given term identifier.
    pub fn mock_with_tid(tid: &str) -> Term<Blk> {
        let empty_block = Blk {
            defs: Vec::new(),
            jmps: Vec::new(),
            indirect_jmp_targets: Vec::new(),
        };
        Term {
            tid: Tid::new(tid),
            term: empty_block,
        }
    }
}
/// Check the combined effect of `merge_def_assignments_to_same_var`
/// followed by `propagate_input_expressions` on a small block.
#[test]
fn expression_propagation() {
use crate::intermediate_representation::UnOpType;
// Input block:
//   tid_1: X = ~Y
//   tid_2: Y = X + Y
//   tid_3: X = ~X
//   tid_4: Y = ~Y
//   tid_5: Y = X + Y
let defs = vec![
Def::assign(
"tid_1",
Variable::mock("X", 8),
Expression::var("Y", 8).un_op(UnOpType::IntNegate),
),
Def::assign(
"tid_2",
Variable::mock("Y", 8),
Expression::var("X", 8).plus(Expression::var("Y", 8)),
),
Def::assign(
"tid_3",
Variable::mock("X", 8),
Expression::var("X", 8).un_op(UnOpType::IntNegate),
),
Def::assign(
"tid_4",
Variable::mock("Y", 8),
Expression::var("Y", 8).un_op(UnOpType::IntNegate),
),
Def::assign(
"tid_5",
Variable::mock("Y", 8),
Expression::var("X", 8).plus(Expression::var("Y", 8)),
),
];
let mut block = Term {
tid: Tid::new("block"),
term: Blk {
defs,
jmps: Vec::new(),
indirect_jmp_targets: Vec::new(),
},
};
block.merge_def_assignments_to_same_var();
block.propagate_input_expressions();
// Expected result:
// - The value of `tid_1` is inserted into `tid_2`.
// - `tid_3` is unchanged, since the definition of `X` from `tid_1` depends on `Y`,
//   which was overwritten by `tid_2`.
// - `tid_4` and `tid_5` are consecutive assignments to `Y` and get merged into `tid_5`.
let result_defs = vec![
Def::assign(
"tid_1",
Variable::mock("X", 8),
Expression::var("Y", 8).un_op(UnOpType::IntNegate),
),
Def::assign(
"tid_2",
Variable::mock("Y", 8),
Expression::var("Y", 8)
.un_op(UnOpType::IntNegate)
.plus(Expression::var("Y", 8)),
),
Def::assign(
"tid_3",
Variable::mock("X", 8),
Expression::var("X", 8).un_op(UnOpType::IntNegate),
),
Def::assign(
"tid_5",
Variable::mock("Y", 8),
Expression::var("X", 8).plus(Expression::var("Y", 8).un_op(UnOpType::IntNegate)),
),
];
assert_eq!(block.term.defs, result_defs);
}
}
use super::{CastOpType, Expression, Variable};
use crate::prelude::*;
/// A side-effectful operation.
/// Can be a register assignment or a memory load/store operation.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub enum Def {
/// A memory load into the register given by `var`.
Load {
/// The target register of the memory load.
/// The size of `var` also determines the number of bytes read from memory.
var: Variable,
/// The expression computing the address to read from.
/// The size of `address` is required to match the pointer size of the corresponding CPU architecture.
address: Expression,
},
/// A memory store operation.
Store {
/// The expression computing the address that is written to.
/// The size of `address` is required to match the pointer size of the corresponding CPU architecture.
address: Expression,
/// The expression computing the value that is written to memory.
/// The size of `value` also determines the number of bytes written.
value: Expression,
},
/// A register assignment, assigning the result of the expression `value` to the register `var`.
Assign {
/// The register that is written to.
var: Variable,
/// The expression computing the value that is assigned to the register.
value: Expression,
},
}
impl Term<Def> {
/// Check whether this instruction zero-extends a sub register into its full register.
///
/// More precisely, the check succeeds if the instruction is an assignment
/// to a variable named `output_name`,
/// whose value is an `IntZExt` cast applied directly to a variable named `output_sub_register`.
///
/// Returns the TID of this instruction if the check succeeds and `None` otherwise.
pub fn check_for_zero_extension(
    &self,
    output_name: String,
    output_sub_register: String,
) -> Option<Tid> {
    if let Def::Assign {
        var: target,
        value:
            Expression::Cast {
                op: CastOpType::IntZExt,
                arg,
                ..
            },
    } = &self.term
    {
        if output_name == target.name {
            // The cast argument has to be the sub register itself and not a more complex expression.
            if let Expression::Var(source) = &**arg {
                if source.name == output_sub_register {
                    return Some(self.tid.clone());
                }
            }
        }
    }
    None
}
/// Substitute every occurrence of `input_var` in the address and value expressions
/// with `replace_with_expression`.
/// Does not change the target variable of assignment- and load-instructions.
pub fn substitute_input_var(
    &mut self,
    input_var: &Variable,
    replace_with_expression: &Expression,
) {
    // Every instruction has one expression to rewrite; only stores have a second one.
    let (first_expr, second_expr) = match &mut self.term {
        Def::Assign { value, .. } => (value, None),
        Def::Load { address, .. } => (address, None),
        Def::Store { address, value } => (address, Some(value)),
    };
    first_expr.substitute_input_var(input_var, replace_with_expression);
    if let Some(expr) = second_expr {
        expr.substitute_input_var(input_var, replace_with_expression);
    }
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::intermediate_representation::BinOpType;
/// Check that `check_for_zero_extension` only accepts zero extensions
/// whose argument is exactly the expected sub register.
#[test]
fn zero_extension_check() {
    // Build a non-temporary register variable.
    let register = |name: &str, size: ByteSize| Variable {
        name: String::from(name),
        size,
        is_temp: false,
    };
    // Build the definition `RAX = cast_op(arg)` with an 8 byte cast result.
    let cast_into_rax = |op: CastOpType, arg: &Expression| Term {
        tid: Tid::new("zero_tid"),
        term: Def::Assign {
            var: register("RAX", ByteSize::new(8)),
            value: Expression::Cast {
                op,
                size: ByteSize::new(8),
                arg: Box::new(arg.clone()),
            },
        },
    };
    let check = |def: &Term<Def>| {
        def.check_for_zero_extension(String::from("RAX"), String::from("EAX"))
    };

    let eax = Expression::Var(register("EAX", ByteSize::new(4)));
    let ecx = Expression::Var(register("ECX", ByteSize::new(4)));
    let eax_minus_ecx = Expression::BinOp {
        op: BinOpType::IntSub,
        lhs: Box::new(eax.clone()),
        rhs: Box::new(ecx),
    };

    // A zero extension of the sub register is recognized.
    let zero_extend_def = cast_into_rax(CastOpType::IntZExt, &eax);
    assert_eq!(check(&zero_extend_def), Some(Tid::new("zero_tid")));

    // An expression that is a zero extension but does not directly contain a variable
    let zero_extend_but_no_var_def = cast_into_rax(CastOpType::IntZExt, &eax_minus_ecx);
    assert_eq!(check(&zero_extend_but_no_var_def), None);

    // A sign extension is not a zero extension.
    let non_zero_extend_def = cast_into_rax(CastOpType::IntSExt, &eax);
    assert_eq!(check(&non_zero_extend_def), None);
}
}
......@@ -5,6 +5,7 @@ use super::{ByteSize, Def};
use crate::{pcode::RegisterProperties, prelude::*};
mod builder;
mod trivial_operation_substitution;
/// An expression is a calculation rule
/// on how to compute a certain value given some variables (register values) as input.
......@@ -75,6 +76,77 @@ pub enum Expression {
},
}
/// The type/mnemonic of a binary operation.
/// See the Ghidra P-Code documentation for more information.
///
/// The variants carry no documentation of their own,
/// which is why the `missing_docs` lint is disabled for this enum.
#[allow(missing_docs)]
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum BinOpType {
Piece,
IntEqual,
IntNotEqual,
IntLess,
IntSLess,
IntLessEqual,
IntSLessEqual,
IntAdd,
IntSub,
IntCarry,
IntSCarry,
IntSBorrow,
IntXOr,
IntAnd,
IntOr,
IntLeft,
IntRight,
IntSRight,
IntMult,
IntDiv,
IntRem,
IntSDiv,
IntSRem,
BoolXOr,
BoolAnd,
BoolOr,
FloatEqual,
FloatNotEqual,
FloatLess,
FloatLessEqual,
FloatAdd,
FloatSub,
FloatMult,
FloatDiv,
}
/// The type/mnemonic of a typecast.
/// See the Ghidra P-Code documentation for more information.
///
/// The variants carry no documentation of their own,
/// which is why the `missing_docs` lint is disabled for this enum.
#[allow(missing_docs)]
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum CastOpType {
IntZExt,
IntSExt,
Int2Float,
Float2Float,
Trunc,
PopCount,
}
/// The type/mnemonic of a unary operation.
/// See the Ghidra P-Code documentation for more information.
///
/// The variants carry no documentation of their own,
/// which is why the `missing_docs` lint is disabled for this enum.
#[allow(missing_docs)]
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum UnOpType {
IntNegate,
Int2Comp,
BoolNegate,
FloatNegate,
FloatAbs,
FloatSqrt,
FloatCeil,
FloatFloor,
FloatRound,
FloatNaN,
}
impl Expression {
/// Return the size (in bytes) of the result value of the expression.
pub fn bytesize(&self) -> ByteSize {
......@@ -116,295 +188,6 @@ impl Expression {
}
}
/// Substitute trivial BinOp-expressions with their results,
/// e.g. substitute `a or a` with `a`.
///
/// This function assumes that `self` is a `BinOp`
/// and it does not substitute trivial expressions in the two input expressions of the `BinOp`.
/// If `self` is not a `BinOp` or if no substitution rule applies, `self` is left unchanged.
fn substitute_trivial_binops(&mut self) {
    use BinOpType::*;
    use Expression::*;
    if let BinOp { op, lhs, rhs } = self {
        if lhs == rhs {
            // Rules for operations whose two operands are the same expression.
            match op {
                BoolAnd | BoolOr | IntAnd | IntOr => {
                    // This is an identity operation
                    *self = (**lhs).clone();
                }
                BoolXOr | IntXOr => {
                    // `a xor a` always equals zero.
                    *self = Expression::Const(Bitvector::zero(lhs.bytesize().into()));
                }
                IntEqual | IntLessEqual | IntSLessEqual => {
                    // `a == a` and `a <= a` are always true (a one byte sized `1`).
                    *self = Expression::Const(Bitvector::one(ByteSize::new(1).into()));
                }
                IntNotEqual | IntLess | IntSLess => {
                    // `a != a` and `a < a` are always false (a one byte sized `0`).
                    *self = Expression::Const(Bitvector::zero(ByteSize::new(1).into()));
                }
                _ => (),
            }
        } else {
            match (&**lhs, op, &**rhs) {
                (Const(bitvec), op, other) | (other, op, Const(bitvec))
                    if bitvec.is_zero() && matches!(op, IntOr | IntXOr | BoolOr | BoolXOr) =>
                {
                    // `a or 0 = a` and `a xor 0 = a`
                    *self = other.clone();
                }
                (Const(bitvec), op, other) | (other, op, Const(bitvec))
                    if bitvec.clone().into_bitnot().is_zero()
                        && matches!(op, IntAnd | BoolAnd) =>
                {
                    // `a and -1 = a` since all bits of -1 are 1.
                    *self = other.clone()
                }
                (
                    Const(bitvec),
                    op,
                    Expression::BinOp {
                        lhs: inner_lhs,
                        op: IntSub,
                        rhs: inner_rhs,
                    },
                )
                | (
                    Expression::BinOp {
                        lhs: inner_lhs,
                        op: IntSub,
                        rhs: inner_rhs,
                    },
                    op,
                    Const(bitvec),
                ) if bitvec.is_zero() && matches!(op, IntEqual | IntNotEqual) => {
                    // `0 == x - y` is equivalent to `x == y` and `0 != x - y` is equivalent to `x != y`.
                    // This only holds for the constant zero:
                    // E.g. `1 != x - y` does not imply `x == y`, so other constants must not be rewritten.
                    let new_op = *op;
                    *self = Expression::BinOp {
                        lhs: inner_lhs.clone(),
                        op: new_op,
                        rhs: inner_rhs.clone(),
                    }
                }
                (
                    Expression::BinOp {
                        lhs: less_left,
                        op: IntSLess,
                        rhs: less_right,
                    },
                    BoolOr,
                    Expression::BinOp {
                        lhs: equal_left,
                        op: IntEqual,
                        rhs: equal_right,
                    },
                )
                | (
                    Expression::BinOp {
                        lhs: equal_left,
                        op: IntEqual,
                        rhs: equal_right,
                    },
                    BoolOr,
                    Expression::BinOp {
                        lhs: less_left,
                        op: IntSLess,
                        rhs: less_right,
                    },
                ) if (less_left == equal_left && less_right == equal_right)
                    || (less_left == equal_right && less_right == equal_left) =>
                {
                    // `x < y or x == y` is equivalent to `x <= y ` (signed comparison)
                    *self = Expression::BinOp {
                        lhs: less_left.clone(),
                        op: IntSLessEqual,
                        rhs: less_right.clone(),
                    };
                }
                (
                    Expression::BinOp {
                        lhs: less_left,
                        op: IntLess,
                        rhs: less_right,
                    },
                    BoolOr,
                    Expression::BinOp {
                        lhs: equal_left,
                        op: IntEqual,
                        rhs: equal_right,
                    },
                )
                | (
                    Expression::BinOp {
                        lhs: equal_left,
                        op: IntEqual,
                        rhs: equal_right,
                    },
                    BoolOr,
                    Expression::BinOp {
                        lhs: less_left,
                        op: IntLess,
                        rhs: less_right,
                    },
                ) if (less_left == equal_left && less_right == equal_right)
                    || (less_left == equal_right && less_right == equal_left) =>
                {
                    // `x < y or x == y` is equivalent to `x <= y ` (unsigned comparison)
                    *self = Expression::BinOp {
                        lhs: less_left.clone(),
                        op: IntLessEqual,
                        rhs: less_right.clone(),
                    };
                }
                _ => (),
            }
        }
    }
}
/// Substitute some trivial expressions with their result.
/// E.g. substitute `a XOR a` with zero or substitute `a OR a` with `a`.
///
/// The substitution works bottom-up:
/// The sub-expressions are simplified first
/// and afterwards the rules for the outermost operation of `self` are applied once.
pub fn substitute_trivial_operations(&mut self) {
use Expression::*;
match self {
// Leaf expressions cannot be simplified any further.
Var(_) | Const(_) | Unknown { .. } => (),
Subpiece {
low_byte,
size,
arg,
} => {
arg.substitute_trivial_operations();
// A subpiece that extracts the complete argument is the argument itself.
if *low_byte == ByteSize::new(0) && *size == arg.bytesize() {
*self = (**arg).clone();
} else {
match &**arg {
Expression::Cast {
arg: inner_arg,
op: CastOpType::IntZExt,
..
}
| Expression::Cast {
arg: inner_arg,
op: CastOpType::IntSExt,
..
} if *low_byte == ByteSize::new(0) && *size == inner_arg.bytesize() => {
// The zero or sign extended part is thrown away by the subpiece operation.
*self = (**inner_arg).clone();
}
Expression::BinOp {
op: BinOpType::Piece,
lhs,
rhs,
} => {
// If the subpiece extracts exactly the `lhs` or the `rhs` of the piece operation,
// we can simplify to just `lhs` or `rhs`.
if *low_byte == rhs.bytesize() && *size == lhs.bytesize() {
*self = (**lhs).clone();
} else if *low_byte == ByteSize::new(0) && *size == rhs.bytesize() {
*self = (**rhs).clone();
}
}
Expression::Subpiece {
low_byte: inner_low_byte,
size: _,
arg: inner_arg,
} => {
// Subpiece of subpiece can be simplified to a single subpiece operation.
*self = Expression::Subpiece {
low_byte: *low_byte + *inner_low_byte,
size: *size,
arg: (*inner_arg).clone(),
}
}
_ => (),
}
}
}
Cast { op, size, arg } => {
arg.substitute_trivial_operations();
// A zero or sign extension that does not change the size is a no-op.
if (*op == CastOpType::IntSExt || *op == CastOpType::IntZExt)
&& *size == arg.bytesize()
{
*self = (**arg).clone();
} else if *op == CastOpType::IntSExt || *op == CastOpType::IntZExt {
match &**arg {
Expression::Cast {
op: inner_op,
size: _,
arg: inner_arg,
} if *op == *inner_op => {
// Merge two zero/sign-extension to one.
*self = Expression::Cast {
op: *op,
size: *size,
arg: inner_arg.clone(),
};
}
_ => (),
}
}
}
UnOp { op, arg } => {
arg.substitute_trivial_operations();
match &**arg {
// Applying the same negation operation twice yields the innermost argument again.
Expression::UnOp {
op: inner_op,
arg: inner_arg,
} if op == inner_op
&& matches!(
op,
UnOpType::IntNegate | UnOpType::BoolNegate | UnOpType::Int2Comp
) =>
{
*self = (**inner_arg).clone();
}
// The boolean negation of an integer comparison is replaced by the complementary comparison.
Expression::BinOp {
lhs: inner_lhs,
op: inner_op,
rhs: inner_rhs,
} if *op == UnOpType::BoolNegate
&& matches!(
inner_op,
BinOpType::IntEqual
| BinOpType::IntNotEqual
| BinOpType::IntLess
| BinOpType::IntSLess
| BinOpType::IntLessEqual
| BinOpType::IntSLessEqual
) =>
{
// `!( x < y)` is equivalent to ` y <= x`
let new_op = match inner_op {
BinOpType::IntEqual => BinOpType::IntNotEqual,
BinOpType::IntNotEqual => BinOpType::IntEqual,
BinOpType::IntLess => BinOpType::IntLessEqual,
BinOpType::IntSLess => BinOpType::IntSLessEqual,
BinOpType::IntLessEqual => BinOpType::IntLess,
BinOpType::IntSLessEqual => BinOpType::IntSLess,
_ => unreachable!(),
};
// Note that we have to swap the left hand side with the right hand side of the binary expression.
*self = Expression::BinOp {
lhs: inner_rhs.clone(),
op: new_op,
rhs: inner_lhs.clone(),
};
}
_ => (),
}
}
BinOp { op: _, lhs, rhs } => {
// Simplify both operands first, then try the rules for the binary operation itself.
lhs.substitute_trivial_operations();
rhs.substitute_trivial_operations();
self.substitute_trivial_binops();
}
}
}
/// Substitute every occurence of `input_var` in `self` with the given `replace_with_expression`.
pub fn substitute_input_var(
&mut self,
......@@ -642,76 +425,5 @@ impl Expression {
}
}
/// The type/mnemonic of a binary operation.
/// See the Ghidra P-Code documentation for more information.
///
/// The variants carry no documentation of their own,
/// which is why the `missing_docs` lint is disabled for this enum.
#[allow(missing_docs)]
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum BinOpType {
Piece,
IntEqual,
IntNotEqual,
IntLess,
IntSLess,
IntLessEqual,
IntSLessEqual,
IntAdd,
IntSub,
IntCarry,
IntSCarry,
IntSBorrow,
IntXOr,
IntAnd,
IntOr,
IntLeft,
IntRight,
IntSRight,
IntMult,
IntDiv,
IntRem,
IntSDiv,
IntSRem,
BoolXOr,
BoolAnd,
BoolOr,
FloatEqual,
FloatNotEqual,
FloatLess,
FloatLessEqual,
FloatAdd,
FloatSub,
FloatMult,
FloatDiv,
}
/// The type/mnemonic of a typecast.
/// See the Ghidra P-Code documentation for more information.
///
/// The variants carry no documentation of their own,
/// which is why the `missing_docs` lint is disabled for this enum.
#[allow(missing_docs)]
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum CastOpType {
IntZExt,
IntSExt,
Int2Float,
Float2Float,
Trunc,
PopCount,
}
/// The type/mnemonic of a unary operation.
/// See the Ghidra P-Code documentation for more information.
///
/// The variants carry no documentation of their own,
/// which is why the `missing_docs` lint is disabled for this enum.
#[allow(missing_docs)]
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum UnOpType {
IntNegate,
Int2Comp,
BoolNegate,
FloatNegate,
FloatAbs,
FloatSqrt,
FloatCeil,
FloatFloor,
FloatRound,
FloatNaN,
}
#[cfg(test)]
mod tests;
use super::*;
impl Expression {
/// Substitute trivial BinOp-expressions with their results,
/// e.g. substitute `a or a` with `a`.
///
/// This function assumes that `self` is a `BinOp`
/// and it does not substitute trivial expressions in the two input expressions of the `BinOp`.
/// If `self` is not a `BinOp` or if no substitution rule applies, `self` is left unchanged.
fn substitute_trivial_binops(&mut self) {
    use BinOpType::*;
    use Expression::*;
    if let BinOp { op, lhs, rhs } = self {
        if lhs == rhs {
            // Rules for operations whose two operands are the same expression.
            match op {
                BoolAnd | BoolOr | IntAnd | IntOr => {
                    // This is an identity operation
                    *self = (**lhs).clone();
                }
                BoolXOr | IntXOr => {
                    // `a xor a` always equals zero.
                    *self = Expression::Const(Bitvector::zero(lhs.bytesize().into()));
                }
                IntEqual | IntLessEqual | IntSLessEqual => {
                    // `a == a` and `a <= a` are always true (a one byte sized `1`).
                    *self = Expression::Const(Bitvector::one(ByteSize::new(1).into()));
                }
                IntNotEqual | IntLess | IntSLess => {
                    // `a != a` and `a < a` are always false (a one byte sized `0`).
                    *self = Expression::Const(Bitvector::zero(ByteSize::new(1).into()));
                }
                _ => (),
            }
        } else {
            match (&**lhs, op, &**rhs) {
                (Const(bitvec), op, other) | (other, op, Const(bitvec))
                    if bitvec.is_zero() && matches!(op, IntOr | IntXOr | BoolOr | BoolXOr) =>
                {
                    // `a or 0 = a` and `a xor 0 = a`
                    *self = other.clone();
                }
                (Const(bitvec), op, other) | (other, op, Const(bitvec))
                    if bitvec.clone().into_bitnot().is_zero()
                        && matches!(op, IntAnd | BoolAnd) =>
                {
                    // `a and -1 = a` since all bits of -1 are 1.
                    *self = other.clone()
                }
                (
                    Const(bitvec),
                    op,
                    Expression::BinOp {
                        lhs: inner_lhs,
                        op: IntSub,
                        rhs: inner_rhs,
                    },
                )
                | (
                    Expression::BinOp {
                        lhs: inner_lhs,
                        op: IntSub,
                        rhs: inner_rhs,
                    },
                    op,
                    Const(bitvec),
                ) if bitvec.is_zero() && matches!(op, IntEqual | IntNotEqual) => {
                    // `0 == x - y` is equivalent to `x == y` and `0 != x - y` is equivalent to `x != y`.
                    // This only holds for the constant zero:
                    // E.g. `1 != x - y` does not imply `x == y`, so other constants must not be rewritten.
                    let new_op = *op;
                    *self = Expression::BinOp {
                        lhs: inner_lhs.clone(),
                        op: new_op,
                        rhs: inner_rhs.clone(),
                    }
                }
                (
                    Expression::BinOp {
                        lhs: less_left,
                        op: IntSLess,
                        rhs: less_right,
                    },
                    BoolOr,
                    Expression::BinOp {
                        lhs: equal_left,
                        op: IntEqual,
                        rhs: equal_right,
                    },
                )
                | (
                    Expression::BinOp {
                        lhs: equal_left,
                        op: IntEqual,
                        rhs: equal_right,
                    },
                    BoolOr,
                    Expression::BinOp {
                        lhs: less_left,
                        op: IntSLess,
                        rhs: less_right,
                    },
                ) if (less_left == equal_left && less_right == equal_right)
                    || (less_left == equal_right && less_right == equal_left) =>
                {
                    // `x < y or x == y` is equivalent to `x <= y ` (signed comparison)
                    *self = Expression::BinOp {
                        lhs: less_left.clone(),
                        op: IntSLessEqual,
                        rhs: less_right.clone(),
                    };
                }
                (
                    Expression::BinOp {
                        lhs: less_left,
                        op: IntLess,
                        rhs: less_right,
                    },
                    BoolOr,
                    Expression::BinOp {
                        lhs: equal_left,
                        op: IntEqual,
                        rhs: equal_right,
                    },
                )
                | (
                    Expression::BinOp {
                        lhs: equal_left,
                        op: IntEqual,
                        rhs: equal_right,
                    },
                    BoolOr,
                    Expression::BinOp {
                        lhs: less_left,
                        op: IntLess,
                        rhs: less_right,
                    },
                ) if (less_left == equal_left && less_right == equal_right)
                    || (less_left == equal_right && less_right == equal_left) =>
                {
                    // `x < y or x == y` is equivalent to `x <= y ` (unsigned comparison)
                    *self = Expression::BinOp {
                        lhs: less_left.clone(),
                        op: IntLessEqual,
                        rhs: less_right.clone(),
                    };
                }
                _ => (),
            }
        }
    }
}
/// Substitute some trivial expressions with their result.
/// E.g. substitute `a XOR a` with zero or substitute `a OR a` with `a`.
///
/// The substitution works bottom-up:
/// The sub-expressions are simplified first
/// and afterwards the rules for the outermost operation of `self` are applied once.
pub fn substitute_trivial_operations(&mut self) {
use Expression::*;
match self {
// Leaf expressions cannot be simplified any further.
Var(_) | Const(_) | Unknown { .. } => (),
Subpiece {
low_byte,
size,
arg,
} => {
arg.substitute_trivial_operations();
// A subpiece that extracts the complete argument is the argument itself.
if *low_byte == ByteSize::new(0) && *size == arg.bytesize() {
*self = (**arg).clone();
} else {
match &**arg {
Expression::Cast {
arg: inner_arg,
op: CastOpType::IntZExt,
..
}
| Expression::Cast {
arg: inner_arg,
op: CastOpType::IntSExt,
..
} if *low_byte == ByteSize::new(0) && *size == inner_arg.bytesize() => {
// The zero or sign extended part is thrown away by the subpiece operation.
*self = (**inner_arg).clone();
}
Expression::BinOp {
op: BinOpType::Piece,
lhs,
rhs,
} => {
// If the subpiece extracts exactly the `lhs` or the `rhs` of the piece operation,
// we can simplify to just `lhs` or `rhs`.
if *low_byte == rhs.bytesize() && *size == lhs.bytesize() {
*self = (**lhs).clone();
} else if *low_byte == ByteSize::new(0) && *size == rhs.bytesize() {
*self = (**rhs).clone();
}
}
Expression::Subpiece {
low_byte: inner_low_byte,
size: _,
arg: inner_arg,
} => {
// Subpiece of subpiece can be simplified to a single subpiece operation.
*self = Expression::Subpiece {
low_byte: *low_byte + *inner_low_byte,
size: *size,
arg: (*inner_arg).clone(),
}
}
_ => (),
}
}
}
Cast { op, size, arg } => {
arg.substitute_trivial_operations();
// A zero or sign extension that does not change the size is a no-op.
if (*op == CastOpType::IntSExt || *op == CastOpType::IntZExt)
&& *size == arg.bytesize()
{
*self = (**arg).clone();
} else if *op == CastOpType::IntSExt || *op == CastOpType::IntZExt {
match &**arg {
Expression::Cast {
op: inner_op,
size: _,
arg: inner_arg,
} if *op == *inner_op => {
// Merge two zero/sign-extension to one.
*self = Expression::Cast {
op: *op,
size: *size,
arg: inner_arg.clone(),
};
}
_ => (),
}
}
}
UnOp { op, arg } => {
arg.substitute_trivial_operations();
match &**arg {
// Applying the same negation operation twice yields the innermost argument again.
Expression::UnOp {
op: inner_op,
arg: inner_arg,
} if op == inner_op
&& matches!(
op,
UnOpType::IntNegate | UnOpType::BoolNegate | UnOpType::Int2Comp
) =>
{
*self = (**inner_arg).clone();
}
// The boolean negation of an integer comparison is replaced by the complementary comparison.
Expression::BinOp {
lhs: inner_lhs,
op: inner_op,
rhs: inner_rhs,
} if *op == UnOpType::BoolNegate
&& matches!(
inner_op,
BinOpType::IntEqual
| BinOpType::IntNotEqual
| BinOpType::IntLess
| BinOpType::IntSLess
| BinOpType::IntLessEqual
| BinOpType::IntSLessEqual
) =>
{
// `!( x < y)` is equivalent to ` y <= x`
let new_op = match inner_op {
BinOpType::IntEqual => BinOpType::IntNotEqual,
BinOpType::IntNotEqual => BinOpType::IntEqual,
BinOpType::IntLess => BinOpType::IntLessEqual,
BinOpType::IntSLess => BinOpType::IntSLessEqual,
BinOpType::IntLessEqual => BinOpType::IntLess,
BinOpType::IntSLessEqual => BinOpType::IntSLess,
_ => unreachable!(),
};
// Note that we have to swap the left hand side with the right hand side of the binary expression.
*self = Expression::BinOp {
lhs: inner_rhs.clone(),
op: new_op,
rhs: inner_lhs.clone(),
};
}
_ => (),
}
}
BinOp { op: _, lhs, rhs } => {
// Simplify both operands first, then try the rules for the binary operation itself.
lhs.substitute_trivial_operations();
rhs.substitute_trivial_operations();
self.substitute_trivial_binops();
}
}
}
}
use super::Expression;
use crate::prelude::*;
/// A `Jmp` instruction affects the control flow of a program, i.e. it may change the instruction pointer.
/// With the exception of `CallOther`, it has no other side effects.
///
/// `Jmp` instructions carry some semantic information with them, like whether a jump is intra- or interprocedural.
/// Note that this semantic information may not always be correct.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub enum Jmp {
/// A direct intraprocedural jump to the targeted `Blk` term identifier.
Branch(Tid),
/// An indirect intraprocedural jump to the address that the given expression evaluates to.
BranchInd(Expression),
/// A direct intraprocedural jump that is only taken if the condition evaluates to true (i.e. not zero).
CBranch {
/// The term ID of the target block of the jump.
target: Tid,
/// The jump is only taken if this expression evaluates to `true`, (i.e. not zero).
condition: Expression,
},
/// A direct interprocedural jump representing a subroutine call.
///
/// Note that this is syntactically equivalent to a `Jmp::Branch`.
Call {
/// The term ID of the target subroutine (`Sub`) or extern symbol of the call.
target: Tid,
/// The term ID of the block that the called function returns to.
/// May be `None` if it is assumed that the called function never returns.
return_: Option<Tid>,
},
/// An indirect interprocedural jump to the address the `target` expression evaluates to
/// and representing a subroutine call.
///
/// Note that this is syntactically equivalent to a `Jmp::BranchInd`.
CallInd {
/// An expression computing the target address of the call.
target: Expression,
/// The term ID of the block that the called function returns to.
/// May be `None` if it is assumed that the called function never returns.
return_: Option<Tid>,
},
/// An indirect interprocedural jump indicating a return from a subroutine.
///
/// Note that this is syntactically equivalent to a `Jmp::BranchInd`.
Return(Expression),
/// This instruction is used for all side effects that are not representable by other instructions
/// or not supported by the disassembler.
///
/// E.g. syscalls and other interrupts are mapped to `CallOther`.
/// Assembly instructions that the disassembler does not support are also mapped to `CallOther`.
/// One can use the `description` field to match for and handle known side effects (e.g. syscalls).
CallOther {
/// A description of the side effect.
description: String,
/// The block term identifier of the block
/// where the disassembler assumes that execution will continue after handling of the side effect.
return_: Option<Tid>,
},
}
......@@ -18,6 +18,18 @@ mod expression;
pub use expression::*;
mod term;
pub use term::*;
mod def;
pub use def::*;
mod jmp;
pub use jmp::*;
mod blk;
pub use blk::*;
mod sub;
pub use sub::*;
mod program;
pub use program::*;
mod project;
pub use project::*;
/// An unsigned number of bytes.
///
......@@ -172,9 +184,24 @@ impl From<String> for Datatype {
#[cfg(test)]
mod tests {
use super::*;
use apint::BitWidth;
use super::*;
impl DatatypeProperties {
/// Create mock datatype properties with fixed sizes for use in unit tests,
/// e.g. 8 byte pointers and 4 byte integers.
pub fn mock() -> DatatypeProperties {
DatatypeProperties {
char_size: ByteSize::new(1),
double_size: ByteSize::new(8),
float_size: ByteSize::new(4),
integer_size: ByteSize::new(4),
long_double_size: ByteSize::new(8),
long_long_size: ByteSize::new(8),
long_size: ByteSize::new(4),
pointer_size: ByteSize::new(8),
short_size: ByteSize::new(2),
}
}
}
#[test]
fn check_bit_to_byte_conversion() {
......
use super::{Blk, ExternSymbol, Sub};
use crate::prelude::*;
/// The `Program` structure represents a disassembled binary.
///
/// It bundles the known subroutines, the extern symbols, the entry points
/// and the address offset applied to the binary image.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Program {
/// The known functions contained in the binary.
pub subs: Vec<Term<Sub>>,
/// Extern symbols linked to the binary by the linker.
pub extern_symbols: Vec<ExternSymbol>,
/// Entry points into the binary,
/// i.e. the term identifiers of functions that may be called from outside of the binary.
pub entry_points: Vec<Tid>,
/// An offset that has been added to all addresses in the program compared to the addresses
/// as specified in the binary file.
///
/// In certain cases, e.g. if the binary specifies a segment to be loaded at address 0,
/// the Ghidra backend may shift the whole binary image by a constant value in memory.
/// Thus addresses as specified by the binary and addresses as reported by Ghidra may differ by a constant offset,
/// which is stored in this value.
pub address_base_offset: u64,
}
impl Program {
    /// Find a block term by its term identifier.
    ///
    /// The subroutines are searched in the order of `subs` and the blocks of each
    /// subroutine in their stored order; the first block whose TID equals `tid` is returned.
    /// Returns `None` if no block with the given TID exists.
    ///
    /// WARNING: The function simply iterates through all blocks,
    /// i.e. it is very inefficient for large projects!
    pub fn find_block(&self, tid: &Tid) -> Option<&Term<Blk>> {
        self.subs
            .iter()
            // `flat_map` is the idiomatic single-step form of `map(..).flatten()`.
            .flat_map(|sub| sub.term.blocks.iter())
            .find(|block| block.tid == *tid)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    impl Program {
        /// Test helper returning a program without subroutines, extern symbols
        /// or entry points and with an address base offset of zero.
        pub fn mock_empty() -> Self {
            Self {
                address_base_offset: 0,
                entry_points: vec![],
                extern_symbols: vec![],
                subs: vec![],
            }
        }
    }
}
use super::{Blk, CallingConvention, DatatypeProperties, Def, Jmp, Program, Sub, Variable};
use crate::prelude::*;
use crate::utils::log::LogMessage;
use std::collections::{HashMap, HashSet};
mod block_duplication_normalization;
use block_duplication_normalization::*;
/// The `Project` struct is the main data structure representing a binary.
///
/// It contains information about the disassembled binary
/// and about the execution environment of the binary.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Project {
/// All (known) executable code of the binary is contained in the `program` term.
pub program: Term<Program>,
/// The CPU architecture on which the binary is assumed to be executed.
pub cpu_architecture: String,
/// The stack pointer register for the given CPU architecture.
/// Its size is also what `get_pointer_bytesize` reports as the pointer size.
pub stack_pointer_register: Variable,
/// The known calling conventions that may be used for calls to extern functions.
pub calling_conventions: Vec<CallingConvention>,
/// A list of all known physical registers for the CPU architecture.
/// It only contains base registers, i.e. sub registers of other registers are not contained.
pub register_list: Vec<Variable>,
/// Contains the properties of C data types (e.g. their sizes).
pub datatype_properties: DatatypeProperties,
}
impl Project {
    /// Return the size (in bytes) of pointers for the architecture of the project.
    ///
    /// The value is taken from the size of the stack pointer register.
    pub fn get_pointer_bytesize(&self) -> ByteSize {
        let stack_pointer = &self.stack_pointer_register;
        stack_pointer.size
    }

    /// Try to guess a standard calling convention from the calling conventions known to the project.
    ///
    /// Returns the first calling convention named `__stdcall` or `__cdecl`,
    /// or `None` if the project contains neither.
    pub fn get_standard_calling_convention(&self) -> Option<&CallingConvention> {
        const STANDARD_NAMES: [&str; 2] = ["__stdcall", "__cdecl"];
        self.calling_conventions
            .iter()
            .find(|cconv| STANDARD_NAMES.contains(&cconv.name.as_str()))
    }
}
impl Project {
/// Simplify every expression of the project in place by replacing
/// trivially computable subexpressions like `a XOR a` with their result.
///
/// All `Def` expressions (assigned values, load/store addresses, stored values)
/// and all expressions of jumps (indirect targets, branch conditions, return targets) are visited.
fn substitute_trivial_expressions(&mut self) {
    let all_blocks = self
        .program
        .term
        .subs
        .iter_mut()
        .flat_map(|sub| sub.term.blocks.iter_mut());
    for block in all_blocks {
        for def in block.term.defs.iter_mut() {
            match &mut def.term {
                Def::Assign { value, .. } => value.substitute_trivial_operations(),
                Def::Load { address, .. } => address.substitute_trivial_operations(),
                Def::Store { address, value } => {
                    address.substitute_trivial_operations();
                    value.substitute_trivial_operations();
                }
            }
        }
        for jmp in block.term.jmps.iter_mut() {
            // Pick the single expression of the jump (if it has one).
            let expression = match &mut jmp.term {
                // Direct jumps and calls carry no expression.
                Jmp::Branch(_) | Jmp::Call { .. } | Jmp::CallOther { .. } => continue,
                Jmp::BranchInd(target) | Jmp::Return(target) => target,
                Jmp::CBranch { condition, .. } => condition,
                Jmp::CallInd { target, .. } => target,
            };
            expression.substitute_trivial_operations();
        }
    }
}
/// Replace jumps to nonexisting TIDs with jumps to a dummy target
/// representing an artificial sink in the control flow graph.
/// Return a log message for each replaced jump target.
///
/// Nonexisting jump targets may be generated by the Ghidra backend
/// if the data at the target address is not a valid assembly instruction.
#[must_use]
fn remove_references_to_nonexisting_tids(&mut self) -> Vec<LogMessage> {
// Gather all existing jump targets
let mut jump_target_tids = HashSet::new();
for sub in self.program.term.subs.iter() {
jump_target_tids.insert(sub.tid.clone());
for block in sub.term.blocks.iter() {
jump_target_tids.insert(block.tid.clone());
}
}
for symbol in self.program.term.extern_symbols.iter() {
jump_target_tids.insert(symbol.tid.clone());
}
// Replace all jumps to non-existing jump targets with jumps to dummy targets
let dummy_sub_tid = Tid::new("Artificial Sink Sub");
let dummy_blk_tid = Tid::new("Artificial Sink Block");
let mut log_messages = Vec::new();
for sub in self.program.term.subs.iter_mut() {
for block in sub.term.blocks.iter_mut() {
if let Err(mut logs) =
block.remove_nonexisting_indirect_jump_targets(&jump_target_tids)
{
log_messages.append(&mut logs);
}
for jmp in block.term.jmps.iter_mut() {
if let Err(log_msg) = jmp.retarget_nonexisting_jump_targets_to_dummy_tid(
&jump_target_tids,
&dummy_sub_tid,
&dummy_blk_tid,
) {
log_messages.push(log_msg);
}
}
}
}
// If at least one dummy jump was inserted, add the corresponding dummy sub and block to the program.
if !log_messages.is_empty() {
let dummy_sub: Term<Sub> = Term {
tid: dummy_sub_tid,
term: Sub {
name: "Artificial Sink Sub".to_string(),
blocks: vec![Term {
tid: dummy_blk_tid,
term: Blk {
defs: Vec::new(),
jmps: Vec::new(),
indirect_jmp_targets: Vec::new(),
},
}],
},
};
self.program.term.subs.push(dummy_sub);
}
log_messages
}
/// Propagate input expressions along variable assignments.
///
/// For each block, assignments to the same variable are merged first
/// and the block-local propagation runs afterwards.
/// The propagation never crosses basic block boundaries.
fn propagate_input_expressions(&mut self) {
    self.program
        .term
        .subs
        .iter_mut()
        .flat_map(|sub| sub.term.blocks.iter_mut())
        .for_each(|block| {
            block.merge_def_assignments_to_same_var();
            block.propagate_input_expressions();
        });
}
/// Run the normalization passes over the project and return the log messages they generated.
///
/// The passes run in this order:
/// 1. Replace jumps to nonexisting TIDs with jumps to artificial sink targets in the CFG.
/// 2. Duplicate blocks so that if a block is contained in several functions,
///    each function gets its own unique copy.
/// 3. Propagate input expressions along variable assignments.
/// 4. Replace trivial expressions like `a XOR a` with their result.
/// 5. Remove dead register assignments.
#[must_use]
pub fn normalize(&mut self) -> Vec<LogMessage> {
    use crate::analysis::dead_variable_elimination::remove_dead_var_assignments;

    // Only the first pass produces log messages.
    let log_messages = self.remove_references_to_nonexisting_tids();
    make_block_to_sub_mapping_unique(self);
    self.propagate_input_expressions();
    self.substitute_trivial_expressions();
    remove_dead_var_assignments(self);
    log_messages
}
}
impl Term<Jmp> {
/// If the TID of a jump target or return target is not contained in `known_tids`
/// replace it with a dummy TID and return an error message.
fn retarget_nonexisting_jump_targets_to_dummy_tid(
&mut self,
known_tids: &HashSet<Tid>,
dummy_sub_tid: &Tid,
dummy_blk_tid: &Tid,
) -> Result<(), LogMessage> {
use Jmp::*;
match &mut self.term {
BranchInd(_) => (),
Branch(tid) | CBranch { target: tid, .. } if known_tids.get(tid).is_none() => {
let error_msg = format!("Jump target at {} does not exist", tid.address);
let error_log = LogMessage::new_error(error_msg).location(self.tid.clone());
*tid = dummy_blk_tid.clone();
return Err(error_log);
}
Call { target, return_ } if known_tids.get(target).is_none() => {
let error_msg = format!("Call target at {} does not exist", target.address);
let error_log = LogMessage::new_error(error_msg).location(self.tid.clone());
*target = dummy_sub_tid.clone();
*return_ = None;
return Err(error_log);
}
Call {
return_: Some(return_tid),
..
}
| CallInd {
return_: Some(return_tid),
..
}
| CallOther {
return_: Some(return_tid),
..
} if known_tids.get(return_tid).is_none() => {
let error_msg = format!("Return target at {} does not exist", return_tid.address);
let error_log = LogMessage::new_error(error_msg).location(self.tid.clone());
*return_tid = dummy_blk_tid.clone();
return Err(error_log);
}
_ => (),
}
Ok(())
}
}
#[cfg(test)]
mod tests {
    use super::*;

    impl Project {
        /// Test helper returning a project with an empty program,
        /// the architecture string `x86_64`, `RSP` as stack pointer register,
        /// eight 8-byte registers and no calling conventions.
        pub fn mock_empty() -> Project {
            let register_names = ["RAX", "RCX", "RDX", "RBX", "RSP", "RBP", "RSI", "RDI"];
            let program = Term {
                tid: Tid::new("program_tid"),
                term: Program::mock_empty(),
            };
            Project {
                program,
                cpu_architecture: "x86_64".to_string(),
                stack_pointer_register: Variable::mock("RSP", 8u64),
                calling_conventions: Vec::new(),
                register_list: register_names
                    .iter()
                    .map(|name| Variable::mock(*name, ByteSize::new(8)))
                    .collect(),
                datatype_properties: DatatypeProperties::mock(),
            }
        }
    }

    /// A branch to an unknown TID has to be retargeted to the dummy block TID.
    #[test]
    fn retarget_nonexisting_jumps() {
        let dangling_branch = Jmp::Branch(Tid::new("nonexisting_target"));
        let mut jmp_term = Term {
            tid: Tid::new("jmp"),
            term: dangling_branch.clone(),
        };
        assert_eq!(jmp_term.term, dangling_branch);

        // With an empty set of known TIDs every target is nonexisting.
        let known_tids = HashSet::new();
        let result = jmp_term.retarget_nonexisting_jump_targets_to_dummy_tid(
            &known_tids,
            &Tid::new("dummy_sub"),
            &Tid::new("dummy_blk"),
        );
        assert!(result.is_err());
        assert_eq!(jmp_term.term, Jmp::Branch(Tid::new("dummy_blk")));
    }
}
use super::*;
impl Term<Jmp> {
    /// Return the block TID at which execution continues inside the same function, if any.
    ///
    /// - For `Branch` and `CBranch` this is the jump target.
    /// - For `Call`, `CallInd` and `CallOther` this is the return target (which may be `None`).
    /// - For `BranchInd` and `Return` the result is always `None`.
    fn get_intraprocedural_target_or_return_block_tid(&self) -> Option<Tid> {
        match &self.term {
            Jmp::Branch(target) | Jmp::CBranch { target, .. } => Some(target.clone()),
            Jmp::Call { return_, .. }
            | Jmp::CallInd { return_, .. }
            | Jmp::CallOther { return_, .. } => return_.clone(),
            Jmp::BranchInd(_) | Jmp::Return(_) => None,
        }
    }
}
impl Term<Blk> {
/// Return a clone of `self` where the given suffix is appended to
/// the TIDs of all contained terms (the block itself and all `Jmp`s and `Def`s).
///
/// Note that all TIDs of jump targets (direct, indirect and return targets) are left unchanged.
fn clone_with_tid_suffix(&self, suffix: &str) -> Self {
let mut cloned_block = self.clone();
cloned_block.tid = cloned_block.tid.with_id_suffix(suffix);
for def in cloned_block.term.defs.iter_mut() {
def.tid = def.tid.clone().with_id_suffix(suffix);
}
for jmp in cloned_block.term.jmps.iter_mut() {
jmp.tid = jmp.tid.clone().with_id_suffix(suffix);
}
cloned_block
}
}
impl Project {
/// Map the TID of every `Sub`, `Blk`, `Def` and `Jmp` term of the project
/// to the TID of the `Sub` that contains the term.
///
/// A `Sub` TID is mapped to itself.
/// If a TID occurs more than once, the entry inserted last wins.
fn generate_tid_to_sub_tid_map(&self) -> HashMap<Tid, Tid> {
    let mut tid_to_sub_map = HashMap::new();
    for sub in &self.program.term.subs {
        let sub_tid = &sub.tid;
        tid_to_sub_map.insert(sub_tid.clone(), sub_tid.clone());
        for block in &sub.term.blocks {
            // The block TID first, then the TIDs of its defs, then those of its jumps.
            let def_tids = block.term.defs.iter().map(|def| &def.tid);
            let jmp_tids = block.term.jmps.iter().map(|jmp| &jmp.tid);
            let contained_tids = std::iter::once(&block.tid).chain(def_tids).chain(jmp_tids);
            tid_to_sub_map.extend(contained_tids.map(|tid| (tid.clone(), sub_tid.clone())));
        }
    }
    tid_to_sub_map
}
/// Build a lookup table from block TIDs to references to the corresponding block terms.
///
/// If several blocks share a TID, the block visited last is the one stored in the map.
fn generate_block_tid_to_block_term_map(&self) -> HashMap<Tid, &Term<Blk>> {
    self.program
        .term
        .subs
        .iter()
        .flat_map(|sub| sub.term.blocks.iter())
        .map(|block| (block.tid.clone(), block))
        .collect()
}
/// Map each `Sub` TID to the set of TIDs of all blocks belonging to the `Sub`.
///
/// Starting from the blocks listed in the `Sub`, the set is closed under
/// intraprocedural jump targets, return targets of calls and indirect jump targets.
/// TIDs without an entry in `block_tid_to_block_map` are added to the set but not followed further.
///
/// Used for the [`make_block_to_sub_mapping_unique`] normalization pass,
/// which assumes that there may exist blocks contained in more than one `Sub`.
fn generate_sub_tid_to_contained_block_tids_map(
    &self,
    block_tid_to_block_map: &HashMap<Tid, &Term<Blk>>,
) -> HashMap<Tid, HashSet<Tid>> {
    let mut sub_to_blocks_map = HashMap::new();
    for sub in &self.program.term.subs {
        let mut pending: Vec<Tid> = sub.term.blocks.iter().map(|blk| blk.tid.clone()).collect();
        let mut contained = HashSet::new();
        while let Some(block_tid) = pending.pop() {
            if contained.contains(&block_tid) {
                // Already handled via another path.
                continue;
            }
            contained.insert(block_tid.clone());
            let block = match block_tid_to_block_map.get(&block_tid) {
                Some(block) => block,
                None => continue,
            };
            // Schedule all not yet visited successors of the block.
            let direct_targets = block
                .term
                .jmps
                .iter()
                .filter_map(|jmp| jmp.get_intraprocedural_target_or_return_block_tid());
            pending.extend(direct_targets.filter(|tid| !contained.contains(tid)));
            let indirect_targets = block.term.indirect_jmp_targets.iter();
            pending.extend(
                indirect_targets
                    .filter(|tid| !contained.contains(*tid))
                    .cloned(),
            );
        }
        sub_to_blocks_map.insert(sub.tid.clone(), contained);
    }
    sub_to_blocks_map
}
/// Create sub-specific copies of all blocks that belong to a `Sub`
/// according to `sub_to_blocks_map` but are originally listed in a different `Sub`.
///
/// The suffix `_<sub TID>` is appended to the TIDs of each copied block
/// and of the `Def`s and `Jmp`s contained in it, so that the new terms get unique TIDs.
/// The TIDs of jump and return targets are not adjusted in this function.
/// The returned map maps the TID of each `Sub` to the blocks newly created for it.
///
/// # Panics
///
/// Panics if `sub_to_blocks_map` has no entry for a `Sub`
/// or if a block that has to be copied is missing in `block_tid_to_block_map`.
///
/// This function is part of the [`make_block_to_sub_mapping_unique`] normalization pass
/// and should not be used for other purposes.
fn duplicate_blocks_contained_in_several_subs(
    &self,
    sub_to_blocks_map: &HashMap<Tid, HashSet<Tid>>,
    tid_to_sub_map: &HashMap<Tid, Tid>,
    block_tid_to_block_map: &HashMap<Tid, &Term<Blk>>,
) -> HashMap<Tid, Vec<Term<Blk>>> {
    self.program
        .term
        .subs
        .iter()
        .map(|sub| {
            let tid_suffix = format!("_{}", sub.tid);
            let additional_blocks: Vec<Term<Blk>> = sub_to_blocks_map
                .get(&sub.tid)
                .unwrap()
                .iter()
                // Only blocks whose original sub is a different one need a copy.
                .filter(|block_tid| tid_to_sub_map.get(*block_tid) != Some(&sub.tid))
                .map(|block_tid| {
                    block_tid_to_block_map
                        .get(block_tid)
                        .unwrap()
                        .clone_with_tid_suffix(&tid_suffix)
                })
                .collect();
            (sub.tid.clone(), additional_blocks)
        })
        .collect()
}
/// Append the suffix `_<sub TID>` to the targets of intraprocedural jumps, to return targets of calls
/// and to indirect jump targets whenever `tid_to_original_sub_map` does not map the target
/// to the `Sub` containing the jump.
///
/// Such targets were duplicated by [`Project::duplicate_blocks_contained_in_several_subs`],
/// so after this adjustment the jumps target the sub-specific copies of the blocks.
///
/// This function is part of the [`make_block_to_sub_mapping_unique`] normalization pass
/// and should not be used for other purposes.
fn append_jump_targets_with_sub_suffix_when_target_block_was_duplicated(
    &mut self,
    tid_to_original_sub_map: &HashMap<Tid, Tid>,
) {
    for sub in self.program.term.subs.iter_mut() {
        let sub_tid = &sub.tid;
        let tid_suffix = format!("_{}", sub_tid);
        // Rename the target if it does not originally belong to the current sub.
        let adjust_target = |target: &mut Tid| {
            if tid_to_original_sub_map.get(&*target) != Some(sub_tid) {
                *target = target.clone().with_id_suffix(&tid_suffix);
            }
        };
        for block in sub.term.blocks.iter_mut() {
            for jump in block.term.jmps.iter_mut() {
                let intraprocedural_target = match &mut jump.term {
                    Jmp::Branch(target) | Jmp::CBranch { target, .. } => Some(target),
                    Jmp::Call { return_, .. }
                    | Jmp::CallInd { return_, .. }
                    | Jmp::CallOther { return_, .. } => return_.as_mut(),
                    Jmp::BranchInd(_) | Jmp::Return(_) => None,
                };
                if let Some(target) = intraprocedural_target {
                    adjust_target(target);
                }
            }
            for target in block.term.indirect_jmp_targets.iter_mut() {
                adjust_target(target);
            }
        }
    }
}
}
/// Give each subroutine its own unique copy of every block
/// that is contained in more than one subroutine.
///
/// The TIDs of the copied blocks (and the contained `Def` and `Jmp` terms)
/// are appended with the sub TID to ensure that TIDs remain globally unique.
/// Target TIDs of intraprocedural jumps are also adjusted
/// to target the sub-specific copy of a block if the target block was duplicated.
pub fn make_block_to_sub_mapping_unique(project: &mut Project) {
    let tid_to_sub_map = project.generate_tid_to_sub_tid_map();

    // Compute the duplicates first, while the project is only borrowed immutably.
    let mut sub_to_additional_blocks_map = {
        let block_tid_to_block_map = project.generate_block_tid_to_block_term_map();
        let sub_to_blocks_map =
            project.generate_sub_tid_to_contained_block_tids_map(&block_tid_to_block_map);
        project.duplicate_blocks_contained_in_several_subs(
            &sub_to_blocks_map,
            &tid_to_sub_map,
            &block_tid_to_block_map,
        )
    };

    // Move the new blocks into the subs they were created for.
    for sub in project.program.term.subs.iter_mut() {
        let additional_blocks = sub_to_additional_blocks_map.remove(&sub.tid).unwrap();
        sub.term.blocks.extend(additional_blocks);
    }

    // Intraprocedural jumps need to be adjusted so that they target the sub-specific duplicates.
    project.append_jump_targets_with_sub_suffix_when_target_block_was_duplicated(&tid_to_sub_map);
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Create a block named `block_name` without `Def`s
    /// whose only jump is a direct branch to `target_name`.
    fn create_block_with_jump_target(block_name: &str, target_name: &str) -> Term<Blk> {
        let jump = Term {
            tid: Tid::new(format!("jmp_{}", block_name)),
            term: Jmp::Branch(Tid::new(target_name)),
        };
        Term {
            tid: Tid::new(block_name),
            term: Blk {
                defs: Vec::new(),
                jmps: vec![jump],
                indirect_jmp_targets: Vec::new(),
            },
        }
    }

    /// Create a sub whose TID and name are both `sub_name` and which contains the given blocks.
    fn create_sub_with_blocks(sub_name: &str, blocks: Vec<Term<Blk>>) -> Term<Sub> {
        let sub = Sub {
            name: sub_name.to_string(),
            blocks,
        };
        Term {
            tid: Tid::new(sub_name),
            term: sub,
        }
    }

    /// `sub_2` jumps into `sub_1` and `sub_3` jumps into `sub_2`,
    /// so both have to receive their own copies of all blocks reachable from them.
    #[test]
    fn duplication_of_blocks_contained_in_several_subs() {
        let sub_1 = create_sub_with_blocks(
            "sub_1",
            vec![
                create_block_with_jump_target("blk_1", "blk_2"),
                create_block_with_jump_target("blk_2", "blk_1"),
            ],
        );
        let sub_2 = create_sub_with_blocks(
            "sub_2",
            vec![create_block_with_jump_target("blk_3", "blk_2")],
        );
        let sub_3 = create_sub_with_blocks(
            "sub_3",
            vec![create_block_with_jump_target("blk_4", "blk_3")],
        );
        let mut project = Project::mock_empty();
        project.program.term.subs = vec![sub_1.clone(), sub_2, sub_3];

        make_block_to_sub_mapping_unique(&mut project);
        let subs = &project.program.term.subs;

        // The first sub only contains its own blocks and stays unchanged.
        assert_eq!(&subs[0], &sub_1);

        // The original block stays in front; the order of the duplicates is not checked.
        let expected_sub_2_blocks = vec![
            create_block_with_jump_target("blk_3", "blk_2_sub_2"),
            create_block_with_jump_target("blk_2_sub_2", "blk_1_sub_2"),
            create_block_with_jump_target("blk_1_sub_2", "blk_2_sub_2"),
        ];
        let sub_2_blocks = &subs[1].term.blocks;
        assert_eq!(sub_2_blocks.len(), 3);
        assert_eq!(&sub_2_blocks[0], &expected_sub_2_blocks[0]);
        for expected_block in &expected_sub_2_blocks[1..] {
            assert!(sub_2_blocks.contains(expected_block));
        }

        let expected_sub_3_blocks = vec![
            create_block_with_jump_target("blk_4", "blk_3_sub_3"),
            create_block_with_jump_target("blk_3_sub_3", "blk_2_sub_3"),
            create_block_with_jump_target("blk_2_sub_3", "blk_1_sub_3"),
            create_block_with_jump_target("blk_1_sub_3", "blk_2_sub_3"),
        ];
        let sub_3_blocks = &subs[2].term.blocks;
        assert_eq!(sub_3_blocks.len(), 4);
        assert_eq!(&sub_3_blocks[0], &expected_sub_3_blocks[0]);
        for expected_block in &expected_sub_3_blocks {
            assert!(sub_3_blocks.contains(expected_block));
        }
    }
}
use super::{Blk, Datatype, Project, Variable};
use crate::prelude::*;
/// A `Sub` or subroutine represents a function with a given name and a list of basic blocks belonging to it.
///
/// Subroutines are *single-entry*,
/// i.e. calling a subroutine will execute the first block in the list of basic blocks.
/// A subroutine may have multiple exits, which are identified by `Jmp::Return` instructions.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Sub {
/// The name of the subroutine.
pub name: String,
/// The basic blocks belonging to the subroutine.
/// The first block is also the entry point of the subroutine.
pub blocks: Vec<Term<Blk>>,
}
/// A parameter or return argument of a function.
///
/// An argument is either passed in a register or on the stack.
/// Both variants carry an optional data type indicator.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub enum Arg {
/// The argument is passed in the given register.
Register {
/// The variable object representing the register.
var: Variable,
/// An optional data type indicator.
data_type: Option<Datatype>,
},
/// The argument is passed on the stack.
/// It is positioned at the given offset (in bytes) relative to the stack pointer on function entry
/// and has the given size.
Stack {
/// The position of the argument on the stack
/// given as offset relative to the stack pointer on function entry.
offset: i64,
/// The size in bytes of the argument.
size: ByteSize,
/// An optional data type indicator.
data_type: Option<Datatype>,
},
}
impl Arg {
    /// Return a copy of the optional data type indicator of the argument,
    /// independent of whether the argument is passed in a register or on the stack.
    pub fn get_data_type(&self) -> Option<Datatype> {
        match self {
            Arg::Register { data_type, .. } | Arg::Stack { data_type, .. } => data_type.clone(),
        }
    }
}
/// An extern symbol represents a function that is dynamically linked from another binary.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct ExternSymbol {
/// The term ID of the extern symbol.
pub tid: Tid,
/// Addresses of possibly multiple locations of the same extern symbol.
pub addresses: Vec<String>,
/// The name of the extern symbol.
pub name: String,
/// The name of the calling convention used for the extern symbol, if known.
pub calling_convention: Option<String>,
/// Parameters of an extern symbol.
/// May be empty if there are no parameters or the parameters are unknown.
pub parameters: Vec<Arg>,
/// Return values of an extern symbol.
/// May be empty if there is no return value or the return values are unknown.
pub return_values: Vec<Arg>,
/// If set to `true`, the function is assumed to never return to its caller when called.
pub no_return: bool,
/// If the function has a variable number of parameters, this flag is set to `true`.
pub has_var_args: bool,
}
impl ExternSymbol {
    /// Return the register holding the return value
    /// if the extern symbol has exactly one return value and it is passed in a register.
    ///
    /// # Errors
    ///
    /// Returns an error if the number of return values is not exactly one
    /// or if the single return value is passed on the stack.
    pub fn get_unique_return_register(&self) -> Result<&Variable, Error> {
        match self.return_values.as_slice() {
            [Arg::Register { var, .. }] => Ok(var),
            [Arg::Stack { .. }] => Err(anyhow!("Return value is passed on the stack")),
            _ => Err(anyhow!("Wrong number of return values")),
        }
    }

    /// Return the parameter of the extern symbol if it has exactly one parameter.
    ///
    /// # Errors
    ///
    /// Returns an error if the number of parameters is not exactly one.
    pub fn get_unique_parameter(&self) -> Result<&Arg, Error> {
        match self.parameters.as_slice() {
            [parameter] => Ok(parameter),
            _ => Err(anyhow!("Wrong number of parameter values")),
        }
    }

    /// Get the calling convention corresponding to the extern symbol.
    ///
    /// The calling convention is looked up by name in the calling conventions of the project.
    /// If the extern symbol has no calling convention set, the name `default` is used.
    ///
    /// # Panics
    ///
    /// Panics if the project contains no calling convention with the looked-up name.
    pub fn get_calling_convention<'a>(&self, project: &'a Project) -> &'a CallingConvention {
        let cconv_name: &str = match &self.calling_convention {
            Some(name) => name.as_str(),
            None => "default",
        };
        let mut known_cconvs = project.calling_conventions.iter();
        known_cconvs
            .find(|cconv| cconv.name == cconv_name)
            .unwrap()
    }
}
/// Calling convention related data.
///
/// Registers are referenced by their names.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct CallingConvention {
/// The name of the calling convention.
/// In the serialized form this field is named `calling_convention`.
#[serde(rename = "calling_convention")]
pub name: String,
/// Possible integer parameter registers.
pub integer_parameter_register: Vec<String>,
/// Possible float parameter registers.
pub float_parameter_register: Vec<String>,
/// A list of possible return registers.
pub return_register: Vec<String>,
/// A list of callee-saved registers,
/// i.e. the values of these registers should be the same after the call as they were before the call.
pub callee_saved_register: Vec<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    impl Sub {
        /// Test helper returning a sub term without blocks
        /// whose TID and name are both derived from `name`.
        pub fn mock(name: impl ToString) -> Term<Sub> {
            let name = name.to_string();
            Term {
                tid: Tid::new(name.clone()),
                term: Sub {
                    name,
                    blocks: Vec::new(),
                },
            }
        }
    }

    impl CallingConvention {
        /// Test helper returning a `__stdcall` calling convention
        /// with one integer and one float parameter register.
        pub fn mock() -> CallingConvention {
            // NOTE(review): "XMMO" ends with the letter `O`, not the digit `0`.
            // Presumably `XMM0` was intended; confirm that no test relies on the literal before changing it.
            Self::mock_with_parameter_registers(
                vec!["RDI".to_string()],
                vec!["XMMO".to_string()],
            )
        }

        /// Test helper returning a `__stdcall` calling convention with the given parameter registers,
        /// `RAX` as return register and `RBP` as callee-saved register.
        pub fn mock_with_parameter_registers(
            integer_parameter_register: Vec<String>,
            float_parameter_register: Vec<String>,
        ) -> CallingConvention {
            // The name `__stdcall` makes the mock useable as standard calling convention in tests.
            let name = "__stdcall".to_string();
            let return_register = vec!["RAX".to_string()];
            let callee_saved_register = vec!["RBP".to_string()];
            CallingConvention {
                name,
                integer_parameter_register,
                float_parameter_register,
                return_register,
                callee_saved_register,
            }
        }
    }

    impl Arg {
        /// Test helper returning a register argument without data type information.
        pub fn mock_register(name: impl ToString, size_in_bytes: impl Into<ByteSize>) -> Arg {
            let var = Variable::mock(name.to_string(), size_in_bytes);
            Arg::Register {
                var,
                data_type: None,
            }
        }
    }

    impl ExternSymbol {
        /// Test helper returning a returning, non-variadic `__stdcall` extern symbol named `mock_symbol`
        /// with one parameter in `RDI` and one return value in `RAX`.
        pub fn mock() -> ExternSymbol {
            let parameters = vec![Arg::mock_register("RDI", 8)];
            let return_values = vec![Arg::mock_register("RAX", 8)];
            ExternSymbol {
                tid: Tid::new("mock_symbol"),
                addresses: vec!["UNKNOWN".to_string()],
                name: "mock_symbol".to_string(),
                calling_convention: Some("__stdcall".to_string()),
                parameters,
                return_values,
                no_return: false,
                has_var_args: false,
            }
        }
    }
}
use super::{ByteSize, CastOpType, Datatype, DatatypeProperties, Expression, Variable};
use crate::prelude::*;
use crate::utils::log::LogMessage;
use std::collections::{HashMap, HashSet};
mod builder;
......@@ -60,1304 +57,3 @@ pub struct Term<T> {
/// The object
pub term: T,
}
/// A side-effectful operation.
/// Can be a register assignment or a memory load/store operation.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub enum Def {
/// A memory load into the register given by `var`.
Load {
/// The target register of the memory load.
/// The size of `var` also determines the number of bytes read from memory.
var: Variable,
/// The expression computing the address to read from.
/// The size of `address` is required to match the pointer size of the corresponding CPU architecture.
address: Expression,
},
/// A memory store operation.
Store {
/// The expression computing the address that is written to.
/// The size of `address` is required to match the pointer size of the corresponding CPU architecture.
address: Expression,
/// The expression computing the value that is written to memory.
/// The size of `value` also determines the number of bytes written.
value: Expression,
},
/// A register assignment, assigning the result of the expression `value` to the register `var`.
Assign {
/// The register that is written to.
var: Variable,
/// The expression computing the value that is assigned to the register.
value: Expression,
},
}
impl Term<Def> {
    /// Check whether the instruction assigns the zero extension
    /// of the register named `output_sub_register` to the register named `output_name`.
    ///
    /// This is used to detect a zero extension of the sub register
    /// that was overwritten by the previous instruction.
    /// Returns the TID of the instruction on a match and `None` otherwise.
    pub fn check_for_zero_extension(
        &self,
        output_name: String,
        output_sub_register: String,
    ) -> Option<Tid> {
        if let Def::Assign {
            var: target_var,
            value:
                Expression::Cast {
                    op: CastOpType::IntZExt,
                    arg,
                    ..
                },
        } = &self.term
        {
            if output_name == target_var.name {
                let argument: &Expression = arg;
                if let Expression::Var(source_var) = argument {
                    if source_var.name == output_sub_register {
                        return Some(self.tid.clone());
                    }
                }
            }
        }
        None
    }

    /// Substitute every occurrence of `input_var` in the address and value expressions
    /// of the instruction with `replace_with_expression`.
    ///
    /// The target variable of assignment and load instructions is left unchanged.
    pub fn substitute_input_var(
        &mut self,
        input_var: &Variable,
        replace_with_expression: &Expression,
    ) {
        match &mut self.term {
            Def::Store { address, value } => {
                address.substitute_input_var(input_var, replace_with_expression);
                value.substitute_input_var(input_var, replace_with_expression);
            }
            // Assignments and loads contain exactly one input expression.
            Def::Assign {
                value: input_expression,
                ..
            }
            | Def::Load {
                address: input_expression,
                ..
            } => input_expression.substitute_input_var(input_var, replace_with_expression),
        }
    }
}
/// A `Jmp` instruction affects the control flow of a program, i.e. it may change the instruction pointer.
/// With the exception of `CallOther`, it has no other side effects.
///
/// `Jmp` instructions carry some semantic information with them, like whether a jump is intra- or interprocedural.
/// Note that this semantic information may not always be correct.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub enum Jmp {
/// A direct intraprocedural jump to the targeted `Blk` term identifier.
Branch(Tid),
/// An indirect intraprocedural jump to the address that the given expression evaluates to.
BranchInd(Expression),
/// A direct intraprocedural jump that is only taken if the condition evaluates to true (i.e. not zero).
CBranch {
/// The term ID of the target block of the jump.
target: Tid,
/// The jump is only taken if this expression evaluates to `true` (i.e. not zero).
condition: Expression,
},
/// A direct interprocedural jump representing a subroutine call.
///
/// Note that this is syntactically equivalent to a `Jmp::Branch`.
Call {
/// The term ID of the target subroutine (`Sub`) or extern symbol of the call.
target: Tid,
/// The term ID of the block that the called function returns to.
/// May be `None` if it is assumed that the called function never returns.
return_: Option<Tid>,
},
/// An indirect interprocedural jump to the address the `target` expression evaluates to
/// and representing a subroutine call.
///
/// Note that this is syntactically equivalent to a `Jmp::BranchInd`.
CallInd {
/// An expression computing the target address of the call.
target: Expression,
/// The term ID of the block that the called function returns to.
/// May be `None` if it is assumed that the called function never returns.
return_: Option<Tid>,
},
/// An indirect interprocedural jump indicating a return from a subroutine.
///
/// Note that this is syntactically equivalent to a `Jmp::BranchInd`.
Return(Expression),
/// This instruction is used for all side effects that are not representable by other instructions
/// or not supported by the disassembler.
///
/// E.g. syscalls and other interrupts are mapped to `CallOther`.
/// Assembly instructions that the disassembler does not support are also mapped to `CallOther`.
/// One can use the `description` field to match for and handle known side effects (e.g. syscalls).
CallOther {
/// A description of the side effect.
description: String,
/// The term identifier of the block
/// where the disassembler assumes that execution will continue after handling of the side effect.
return_: Option<Tid>,
},
}
impl Term<Jmp> {
/// If the jump is intraprocedural, return its target TID.
/// If the jump is a call, return the TID of the return target.
fn get_intraprocedural_target_or_return_block_tid(&self) -> Option<Tid> {
match &self.term {
Jmp::BranchInd(_) | Jmp::Return(_) => None,
Jmp::Branch(tid) => Some(tid.clone()),
Jmp::CBranch { target, .. } => Some(target.clone()),
Jmp::Call { return_, .. }
| Jmp::CallInd { return_, .. }
| Jmp::CallOther { return_, .. } => return_.as_ref().cloned(),
}
}
/// If the TID of a jump target or return target is not contained in `known_tids`
/// replace it with a dummy TID and return an error message.
fn retarget_nonexisting_jump_targets_to_dummy_tid(
&mut self,
known_tids: &HashSet<Tid>,
dummy_sub_tid: &Tid,
dummy_blk_tid: &Tid,
) -> Result<(), LogMessage> {
use Jmp::*;
match &mut self.term {
BranchInd(_) => (),
Branch(tid) | CBranch { target: tid, .. } if known_tids.get(tid).is_none() => {
let error_msg = format!("Jump target at {} does not exist", tid.address);
let error_log = LogMessage::new_error(error_msg).location(self.tid.clone());
*tid = dummy_blk_tid.clone();
return Err(error_log);
}
Call { target, return_ } if known_tids.get(target).is_none() => {
let error_msg = format!("Call target at {} does not exist", target.address);
let error_log = LogMessage::new_error(error_msg).location(self.tid.clone());
*target = dummy_sub_tid.clone();
*return_ = None;
return Err(error_log);
}
Call {
return_: Some(return_tid),
..
}
| CallInd {
return_: Some(return_tid),
..
}
| CallOther {
return_: Some(return_tid),
..
} if known_tids.get(return_tid).is_none() => {
let error_msg = format!("Return target at {} does not exist", return_tid.address);
let error_log = LogMessage::new_error(error_msg).location(self.tid.clone());
*return_tid = dummy_blk_tid.clone();
return Err(error_log);
}
_ => (),
}
Ok(())
}
}
/// A basic block is a sequence of `Def` instructions followed by up to two `Jmp` instructions.
///
/// The `Def` instructions represent side-effectful operations that are executed in order when the block is entered.
/// `Def` instructions do not affect the control flow of a program.
///
/// The `Jmp` instructions represent control flow affecting operations.
/// There can only be zero, one or two `Jmp`s:
/// - Zero `Jmp`s indicate that the next instruction to be executed could not be discerned.
/// This should only happen on disassembler errors or on dead ends in the control flow graph that were deliberately inserted by the user.
/// - If there is exactly one `Jmp`, it is required to be an unconditional jump.
/// - For two jumps, the first one has to be a conditional jump,
/// where the second unconditional jump is only taken if the condition of the first jump evaluates to false.
///
/// If one of the `Jmp` instructions is an indirect jump,
/// then the `indirect_jmp_targets` is a list of possible jump target addresses for that jump.
/// The list may not be complete and the entries are not guaranteed to be correct.
///
/// Basic blocks are *single entry, single exit*, i.e. a basic block is only entered at the beginning
/// and is only exited by the jump instructions at the end of the block.
/// If a new control flow edge is discovered that would jump to the middle of a basic block,
/// the block structure needs to be updated accordingly.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Blk {
/// The `Def` instructions of the basic block in order of execution.
pub defs: Vec<Term<Def>>,
/// The `Jmp` instructions of the basic block.
/// See the type-level documentation for the allowed number and order of jumps.
pub jmps: Vec<Term<Jmp>>,
/// If the basic block contains an indirect jump,
/// this field contains the TIDs of possible target blocks of the jump.
///
/// Note that possible targets of indirect calls are *not* contained,
/// since the [`Project::make_block_to_sub_mapping_unique`] normalization pass assumes
/// that only intraprocedural jump targets are contained in this field.
pub indirect_jmp_targets: Vec<Tid>,
}
impl Term<Blk> {
/// Create a copy of the block in which `suffix` is appended to the TID of the block itself
/// and to the TIDs of all `Def` and `Jmp` terms contained in the block.
///
/// The TIDs of jump targets (direct, indirect and return targets) are *not* modified.
fn clone_with_tid_suffix(&self, suffix: &str) -> Self {
    let mut duplicate = self.clone();
    duplicate.tid = duplicate.tid.with_id_suffix(suffix);
    duplicate
        .term
        .defs
        .iter_mut()
        .for_each(|def| def.tid = def.tid.clone().with_id_suffix(suffix));
    duplicate
        .term
        .jmps
        .iter_mut()
        .for_each(|jmp| jmp.tid = jmp.tid.clone().with_id_suffix(suffix));
    duplicate
}
/// Remove indirect jump targets for which no corresponding target block exists.
///
/// A target is kept if and only if its TID is contained in `known_block_tids`.
/// The relative order of the remaining targets is preserved.
///
/// # Errors
///
/// If at least one target was removed, one error message per removed target is returned.
/// Each message is located at the TID of this block.
/// Note that the targets are removed even if `Err` is returned.
pub fn remove_nonexisting_indirect_jump_targets(
    &mut self,
    known_block_tids: &HashSet<Tid>,
) -> Result<(), Vec<LogMessage>> {
    // Borrow the TID separately, so that the closure below does not need to borrow all of `self`
    // while the target list is mutably borrowed.
    let block_tid = &self.tid;
    let mut logs = Vec::new();
    // Filter in place: no surviving target has to be cloned and no new vector is allocated.
    self.term.indirect_jmp_targets.retain(|target| {
        let target_exists = known_block_tids.contains(target);
        if !target_exists {
            let error_msg = format!("Indirect jump target at {} does not exist", target.address);
            logs.push(LogMessage::new_error(error_msg).location(block_tid.clone()));
        }
        target_exists
    });
    if logs.is_empty() {
        Ok(())
    } else {
        Err(logs)
    }
}
/// Wherever possible, substitute input variables of expressions
/// with the input expression that defines the input variable.
///
/// Note that substitution is only possible
/// if the input variables of the input expression itself did not change since the definition of said variable.
///
/// The expression propagation allows the [`Project::substitute_trivial_expressions`] normalization pass
/// to further simplify the generated expressions
/// and allows more dead stores to be removed during [dead variable elimination](`crate::analysis::dead_variable_elimination`).
pub fn propagate_input_expressions(&mut self) {
let mut insertable_expressions = Vec::new();
for def in self.term.defs.iter_mut() {
match &mut def.term {
Def::Assign {
var,
value: expression,
} => {
// insert known input expressions
for (input_var, input_expr) in insertable_expressions.iter() {
expression.substitute_input_var(input_var, input_expr);
}
// expressions dependent on the assigned variable are no longer insertable
insertable_expressions.retain(|(input_var, input_expr)| {
input_var != var && !input_expr.input_vars().into_iter().any(|x| x == var)
});
// If the value of the assigned variable does not depend on the former value of the variable,
// then it is insertable for future expressions.
if !expression.input_vars().into_iter().any(|x| x == var) {
insertable_expressions.push((var.clone(), expression.clone()));
}
}
Def::Load {
var,
address: expression,
} => {
// insert known input expressions
for (input_var, input_expr) in insertable_expressions.iter() {
expression.substitute_input_var(input_var, input_expr);
}
// expressions dependent on the assigned variable are no longer insertable
insertable_expressions.retain(|(input_var, input_expr)| {
input_var != var && !input_expr.input_vars().into_iter().any(|x| x == var)
});
}
Def::Store { address, value } => {
// insert known input expressions
for (input_var, input_expr) in insertable_expressions.iter() {
address.substitute_input_var(input_var, input_expr);
value.substitute_input_var(input_var, input_expr);
}
}
}
}
for jump in self.term.jmps.iter_mut() {
match &mut jump.term {
Jmp::Branch(_) | Jmp::Call { .. } | Jmp::CallOther { .. } => (),
Jmp::BranchInd(expr)
| Jmp::CBranch {
condition: expr, ..
}
| Jmp::CallInd { target: expr, .. }
| Jmp::Return(expr) => {
// insert known input expressions
for (input_var, input_expr) in insertable_expressions.iter() {
expr.substitute_input_var(input_var, input_expr);
}
}
}
}
}
/// Merge subsequent assignments to the same variable to a single assignment to that variable.
///
/// The value expressions of merged assignments can often be simplified later on
/// in the [`Project::substitute_trivial_expressions`] normalization pass.
pub fn merge_def_assignments_to_same_var(&mut self) {
let mut new_defs = Vec::new();
let mut last_def_opt = None;
for def in self.term.defs.iter() {
if let Def::Assign {
var: current_var, ..
} = &def.term
{
if let Some(Term {
term:
Def::Assign {
var: last_var,
value: last_value,
},
..
}) = &last_def_opt
{
if current_var == last_var {
let mut substituted_def = def.clone();
substituted_def.substitute_input_var(last_var, last_value);
last_def_opt = Some(substituted_def);
} else {
new_defs.push(last_def_opt.unwrap());
last_def_opt = Some(def.clone());
}
} else if last_def_opt.is_some() {
panic!(); // Only assign-defs should be saved in last_def.
} else {
last_def_opt = Some(def.clone());
}
} else {
if let Some(last_def) = last_def_opt {
new_defs.push(last_def);
}
new_defs.push(def.clone());
last_def_opt = None;
}
}
if let Some(last_def) = last_def_opt {
new_defs.push(last_def);
}
self.term.defs = new_defs;
}
}
/// A `Sub` or subroutine represents a function with a given name and a list of basic blocks belonging to it.
///
/// Subroutines are *single-entry*,
/// i.e. calling a subroutine will execute the first block in the list of basic blocks.
/// A subroutine may have multiple exits, which are identified by `Jmp::Return` instructions.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Sub {
/// The name of the subroutine.
pub name: String,
/// The basic blocks belonging to the subroutine.
/// The first block is also the entry point of the subroutine.
pub blocks: Vec<Term<Blk>>,
}
/// A parameter or return argument of a function.
///
/// An argument is either passed in a register or on the stack.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub enum Arg {
/// The argument is passed in the given register.
Register {
/// The variable object representing the register.
var: Variable,
/// An optional data type indicator.
data_type: Option<Datatype>,
},
/// The argument is passed on the stack.
/// It is positioned at the given offset (in bytes) relative to the stack pointer on function entry
/// and has the given size.
Stack {
/// The position of the argument on the stack
/// given as offset (in bytes) relative to the stack pointer on function entry.
offset: i64,
/// The size in bytes of the argument.
size: ByteSize,
/// An optional data type indicator.
data_type: Option<Datatype>,
},
}
impl Arg {
    /// Return a copy of the data type indicator of the argument.
    /// Returns `None` if no data type is set.
    pub fn get_data_type(&self) -> Option<Datatype> {
        // Both variants carry the same optional data type field.
        let (Arg::Register { data_type, .. } | Arg::Stack { data_type, .. }) = self;
        data_type.clone()
    }
}
/// An extern symbol represents a function that is dynamically linked from another binary.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct ExternSymbol {
/// The term ID of the extern symbol.
pub tid: Tid,
/// Addresses of possibly multiple locations of the same extern symbol.
pub addresses: Vec<String>,
/// The name of the extern symbol.
pub name: String,
/// The name of the calling convention used for the extern symbol if known.
pub calling_convention: Option<String>,
/// Parameters of an extern symbol.
/// May be empty if there are no parameters or the parameters are unknown.
pub parameters: Vec<Arg>,
/// Return values of an extern symbol.
/// May be empty if there is no return value or the return values are unknown.
pub return_values: Vec<Arg>,
/// If set to `true`, the function is assumed to never return to its caller when called.
pub no_return: bool,
/// If the function has a variable number of parameters, this flag is set to `true`.
pub has_var_args: bool,
}
impl ExternSymbol {
    /// If the extern symbol has exactly one return value that is passed in a register,
    /// return the register.
    ///
    /// # Errors
    ///
    /// Returns an error if the number of return values is not exactly one
    /// or if the single return value is passed on the stack.
    pub fn get_unique_return_register(&self) -> Result<&Variable, Error> {
        match self.return_values.as_slice() {
            [Arg::Register { var, .. }] => Ok(var),
            [Arg::Stack { .. }] => Err(anyhow!("Return value is passed on the stack")),
            _ => Err(anyhow!("Wrong number of return values")),
        }
    }

    /// If the extern symbol has exactly one parameter, return the parameter.
    ///
    /// # Errors
    ///
    /// Returns an error if the number of parameters is not exactly one.
    pub fn get_unique_parameter(&self) -> Result<&Arg, Error> {
        match self.parameters.as_slice() {
            [parameter] => Ok(parameter),
            _ => Err(anyhow!("Wrong number of parameter values")),
        }
    }

    /// Get the calling convention corresponding to the extern symbol.
    ///
    /// The calling convention is looked up by name in the list of calling conventions of the project.
    /// If the extern symbol has no calling convention set, the name `"default"` is used for the lookup.
    ///
    /// # Panics
    ///
    /// Panics if the project does not contain a calling convention with the looked-up name.
    pub fn get_calling_convention<'a>(&self, project: &'a Project) -> &'a CallingConvention {
        let cconv_name: &str = self.calling_convention.as_deref().unwrap_or("default");
        project
            .calling_conventions
            .iter()
            .find(|cconv| cconv.name == cconv_name)
            .unwrap_or_else(|| {
                // State which lookup failed instead of panicking with a bare `unwrap` message.
                panic!(
                    "Calling convention `{}` of extern symbol `{}` is not known to the project",
                    cconv_name, self.name
                )
            })
    }
}
/// The `Program` structure represents a disassembled binary.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Program {
/// The known functions contained in the binary.
pub subs: Vec<Term<Sub>>,
/// Extern symbols linked to the binary by the linker.
pub extern_symbols: Vec<ExternSymbol>,
/// Entry points into the binary,
/// i.e. the term identifiers of functions that may be called from outside of the binary.
pub entry_points: Vec<Tid>,
/// An offset that has been added to all addresses in the program compared to the addresses
/// as specified in the binary file.
///
/// In certain cases, e.g. if the binary specifies a segment to be loaded at address 0,
/// the Ghidra backend may shift the whole binary image by a constant value in memory.
/// Thus addresses as specified by the binary and addresses as reported by Ghidra may differ by a constant offset,
/// which is stored in this value.
pub address_base_offset: u64,
}
impl Program {
    /// Search all subroutines of the program for the block with the given term identifier.
    /// Returns the first matching block or `None` if no block has the given TID.
    ///
    /// WARNING: This is a linear search through all blocks of all subroutines,
    /// i.e. it is very inefficient for large projects!
    pub fn find_block(&self, tid: &Tid) -> Option<&Term<Blk>> {
        self.subs
            .iter()
            .flat_map(|sub| sub.term.blocks.iter())
            .find(|block| &block.tid == tid)
    }
}
/// Calling convention related data
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct CallingConvention {
/// The name of the calling convention.
/// In the serialized form this field is named `calling_convention`.
#[serde(rename = "calling_convention")]
pub name: String,
/// Possible integer parameter registers.
pub integer_parameter_register: Vec<String>,
/// Possible float parameter registers.
pub float_parameter_register: Vec<String>,
/// A list of possible return registers.
pub return_register: Vec<String>,
/// A list of callee-saved registers,
/// i.e. the values of these registers should be the same after the call as they were before the call.
pub callee_saved_register: Vec<String>,
}
/// The `Project` struct is the main data structure representing a binary.
///
/// It contains information about the disassembled binary
/// and about the execution environment of the binary.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Project {
/// All (known) executable code of the binary is contained in the `program` term.
pub program: Term<Program>,
/// The CPU architecture on which the binary is assumed to be executed.
pub cpu_architecture: String,
/// The stack pointer register for the given CPU architecture.
pub stack_pointer_register: Variable,
/// The known calling conventions that may be used for calls to extern functions.
pub calling_conventions: Vec<CallingConvention>,
/// A list of all known physical registers for the CPU architecture.
/// It only contains base registers, i.e. sub registers of other registers are not contained.
pub register_list: Vec<Variable>,
/// Contains the properties of C data types (e.g. their sizes).
pub datatype_properties: DatatypeProperties,
}
impl Project {
    /// Return the size (in bytes) for pointers of the given architecture.
    ///
    /// The pointer size is taken to be the size of the stack pointer register.
    pub fn get_pointer_bytesize(&self) -> ByteSize {
        let stack_pointer = &self.stack_pointer_register;
        stack_pointer.size
    }

    /// Try to guess a standard calling convention from the list of calling conventions in the project.
    ///
    /// Returns the first calling convention that is named `__stdcall` or `__cdecl`,
    /// or `None` if the project contains no calling convention with one of these names.
    pub fn get_standard_calling_convention(&self) -> Option<&CallingConvention> {
        self.calling_conventions
            .iter()
            .find(|cconv| matches!(cconv.name.as_str(), "__stdcall" | "__cdecl"))
    }
}
impl Project {
/// Replace trivially computable subexpressions like `a XOR a` with their result.
///
/// The substitution is applied to every expression contained in a `Def` or `Jmp` term
/// of any block of any subroutine of the project.
fn substitute_trivial_expressions(&mut self) {
    let all_blocks = self
        .program
        .term
        .subs
        .iter_mut()
        .flat_map(|sub| sub.term.blocks.iter_mut());
    for block in all_blocks {
        for def in block.term.defs.iter_mut() {
            match &mut def.term {
                Def::Store { address, value } => {
                    address.substitute_trivial_operations();
                    value.substitute_trivial_operations();
                }
                Def::Assign { value: expr, .. } | Def::Load { address: expr, .. } => {
                    expr.substitute_trivial_operations();
                }
            }
        }
        for jmp in block.term.jmps.iter_mut() {
            let expr = match &mut jmp.term {
                Jmp::BranchInd(expr)
                | Jmp::CBranch {
                    condition: expr, ..
                }
                | Jmp::CallInd { target: expr, .. }
                | Jmp::Return(expr) => expr,
                // These jumps do not contain an expression.
                Jmp::Branch(_) | Jmp::Call { .. } | Jmp::CallOther { .. } => continue,
            };
            expr.substitute_trivial_operations();
        }
    }
}
/// Map the TID of every `Sub`, `Blk`, `Def` and `Jmp` term of the project
/// to the TID of the `Sub` that lists the term.
///
/// The TID of a `Sub` is mapped to itself.
/// If a TID occurs in more than one `Sub`, the last `Sub` in the list of subroutines wins.
fn generate_tid_to_sub_tid_map(&self) -> HashMap<Tid, Tid> {
    let mut tid_to_sub_map = HashMap::new();
    for sub in self.program.term.subs.iter() {
        let sub_tid = &sub.tid;
        tid_to_sub_map.insert(sub_tid.clone(), sub_tid.clone());
        for block in sub.term.blocks.iter() {
            // The TID of the block itself followed by the TIDs of all terms inside the block.
            let contained_tids = std::iter::once(&block.tid)
                .chain(block.term.defs.iter().map(|def| &def.tid))
                .chain(block.term.jmps.iter().map(|jmp| &jmp.tid));
            for tid in contained_tids {
                tid_to_sub_map.insert(tid.clone(), sub_tid.clone());
            }
        }
    }
    tid_to_sub_map
}
/// Build a lookup table from the TID of each block of the project to a reference to the block.
///
/// If several blocks share the same TID, the last one (in the order of the subroutines and their blocks) wins.
fn generate_block_tid_to_block_term_map(&self) -> HashMap<Tid, &Term<Blk>> {
    self.program
        .term
        .subs
        .iter()
        .flat_map(|sub| sub.term.blocks.iter())
        .map(|block| (block.tid.clone(), block))
        .collect()
}
/// Generate a map from all `Sub` TIDs to the set TIDs of all contained blocks in the `Sub`.
/// Used for the [`Project::make_block_to_sub_mapping_unique`] normalization pass,
/// as this function assumes that there may exist blocks contained in more than one `Sub`.
fn generate_sub_tid_to_contained_block_tids_map(
&self,
block_tid_to_block_map: &HashMap<Tid, &Term<Blk>>,
) -> HashMap<Tid, HashSet<Tid>> {
let mut sub_to_blocks_map = HashMap::new();
for sub in self.program.term.subs.iter() {
let mut worklist: Vec<Tid> =
sub.term.blocks.iter().map(|blk| blk.tid.clone()).collect();
let mut block_set = HashSet::new();
while let Some(block_tid) = worklist.pop() {
if block_set.get(&block_tid).is_none() {
block_set.insert(block_tid.clone());
if let Some(block) = block_tid_to_block_map.get(&block_tid) {
for jmp in block.term.jmps.iter() {
if let Some(tid) = jmp.get_intraprocedural_target_or_return_block_tid()
{
if block_set.get(&tid).is_none() {
worklist.push(tid);
}
}
}
for target_tid in block.term.indirect_jmp_targets.iter() {
if block_set.get(target_tid).is_none() {
worklist.push(target_tid.clone())
}
}
}
}
}
sub_to_blocks_map.insert(sub.tid.clone(), block_set);
}
sub_to_blocks_map
}
/// Create duplicates of blocks that are contained in several subfunctions.
///
/// For each `Sub`, every block in its set of contained blocks (given by `sub_to_blocks_map`)
/// that is not mapped to this `Sub` by `tid_to_sub_map` gets duplicated.
/// The string `_<sub TID>` is appended to the TIDs of the duplicate and of the `Def`s and `Jmp`s contained in it
/// (to ensure that the newly created terms have unique TIDs).
/// The TIDs of jump and return targets are not adjusted in this function.
/// The returned map maps the TID of each `Sub` to the newly created blocks for that `Sub`.
///
/// The function panics if `sub_to_blocks_map` has no entry for a `Sub`
/// or if a block that has to be duplicated is not contained in `block_tid_to_block_map`.
///
/// This function is part of the [`Project::make_block_to_sub_mapping_unique`] normalization pass
/// and should not be used for other purposes.
fn duplicate_blocks_contained_in_several_subs(
    &self,
    sub_to_blocks_map: &HashMap<Tid, HashSet<Tid>>,
    tid_to_sub_map: &HashMap<Tid, Tid>,
    block_tid_to_block_map: &HashMap<Tid, &Term<Blk>>,
) -> HashMap<Tid, Vec<Term<Blk>>> {
    let mut sub_to_additional_blocks_map = HashMap::new();
    for sub in self.program.term.subs.iter() {
        let tid_suffix = format!("_{}", sub.tid);
        let additional_blocks: Vec<Term<Blk>> = sub_to_blocks_map
            .get(&sub.tid)
            .unwrap()
            .iter()
            // Only blocks that originally belong to another sub need a sub-specific copy.
            .filter(|block_tid| tid_to_sub_map.get(*block_tid) != Some(&sub.tid))
            .map(|block_tid| {
                block_tid_to_block_map
                    .get(block_tid)
                    .unwrap()
                    .clone_with_tid_suffix(&tid_suffix)
            })
            .collect();
        sub_to_additional_blocks_map.insert(sub.tid.clone(), additional_blocks);
    }
    sub_to_additional_blocks_map
}
/// Appends the `Sub` TID to targets of intraprocedural jumps
/// if the target block was duplicated by the [`Project::duplicate_blocks_contained_in_several_subs`] function,
/// so that the jumps target the correct blocks again.
///
/// A target counts as duplicated if `tid_to_original_sub_map` does not map it to the `Sub` containing the jump.
/// Adjusted are targets of direct branches, return targets of calls and indirect jump targets of blocks.
///
/// This function is part of the [`Project::make_block_to_sub_mapping_unique`] normalization pass
/// and should not be used for other purposes.
fn append_jump_targets_with_sub_suffix_when_target_block_was_duplicated(
    &mut self,
    tid_to_original_sub_map: &HashMap<Tid, Tid>,
) {
    for sub in self.program.term.subs.iter_mut() {
        let sub_tid = &sub.tid;
        let tid_suffix = format!("_{}", sub_tid);
        // Redirect a target to the sub-specific duplicate if it originally belongs to another sub.
        let retarget = |target: &mut Tid| {
            if tid_to_original_sub_map.get(&*target) != Some(sub_tid) {
                *target = target.clone().with_id_suffix(&tid_suffix);
            }
        };
        for block in sub.term.blocks.iter_mut() {
            for jump in block.term.jmps.iter_mut() {
                match &mut jump.term {
                    Jmp::Branch(target) | Jmp::CBranch { target, .. } => retarget(target),
                    Jmp::Call { return_, .. }
                    | Jmp::CallInd { return_, .. }
                    | Jmp::CallOther { return_, .. } => {
                        if let Some(target) = return_ {
                            retarget(target);
                        }
                    }
                    // These jumps have no intraprocedural target TID.
                    Jmp::BranchInd(_) | Jmp::Return(_) => (),
                }
            }
            for target in block.term.indirect_jmp_targets.iter_mut() {
                retarget(target);
            }
        }
    }
}
/// Give each subroutine its own unique copy of every block that is contained in more than one subroutine.
///
/// The TIDs of the copied blocks (and the contained `Def` and `Jmp` terms)
/// are appended with the sub TID to ensure that TIDs remain globally unique.
/// Target TIDs of intraprocedural jumps are also adjusted
/// to target the sub-specific copy of a block if the target block was duplicated.
fn make_block_to_sub_mapping_unique(&mut self) {
    let tid_to_sub_map = self.generate_tid_to_sub_tid_map();
    // The block map borrows from `self`, so it is confined to this scope
    // to make sure that the borrow ends before the subs get modified.
    let mut duplicated_blocks = {
        let block_tid_to_block_map = self.generate_block_tid_to_block_term_map();
        let sub_to_blocks_map =
            self.generate_sub_tid_to_contained_block_tids_map(&block_tid_to_block_map);
        self.duplicate_blocks_contained_in_several_subs(
            &sub_to_blocks_map,
            &tid_to_sub_map,
            &block_tid_to_block_map,
        )
    };
    // Add the new blocks to the subs
    for sub in self.program.term.subs.iter_mut() {
        let mut new_blocks = duplicated_blocks.remove(&sub.tid).unwrap();
        sub.term.blocks.append(&mut new_blocks);
    }
    // Intraprocedural jumps need to be adjusted so that they target the sub-specific duplicates.
    self.append_jump_targets_with_sub_suffix_when_target_block_was_duplicated(&tid_to_sub_map);
}
/// Replace jumps to nonexisting TIDs with jumps to a dummy target
/// representing an artificial sink in the control flow graph.
/// Return a log message for each replaced jump target.
///
/// Nonexisting jump targets may be generated by the Ghidra backend
/// if the data at the target address is not a valid assembly instruction.
#[must_use]
fn remove_references_to_nonexisting_tids(&mut self) -> Vec<LogMessage> {
// Gather all existing jump targets
let mut jump_target_tids = HashSet::new();
for sub in self.program.term.subs.iter() {
jump_target_tids.insert(sub.tid.clone());
for block in sub.term.blocks.iter() {
jump_target_tids.insert(block.tid.clone());
}
}
for symbol in self.program.term.extern_symbols.iter() {
jump_target_tids.insert(symbol.tid.clone());
}
// Replace all jumps to non-existing jump targets with jumps to dummy targets
let dummy_sub_tid = Tid::new("Artificial Sink Sub");
let dummy_blk_tid = Tid::new("Artificial Sink Block");
let mut log_messages = Vec::new();
for sub in self.program.term.subs.iter_mut() {
for block in sub.term.blocks.iter_mut() {
if let Err(mut logs) =
block.remove_nonexisting_indirect_jump_targets(&jump_target_tids)
{
log_messages.append(&mut logs);
}
for jmp in block.term.jmps.iter_mut() {
if let Err(log_msg) = jmp.retarget_nonexisting_jump_targets_to_dummy_tid(
&jump_target_tids,
&dummy_sub_tid,
&dummy_blk_tid,
) {
log_messages.push(log_msg);
}
}
}
}
// If at least one dummy jump was inserted, add the corresponding dummy sub and block to the program.
if !log_messages.is_empty() {
let dummy_sub: Term<Sub> = Term {
tid: dummy_sub_tid,
term: Sub {
name: "Artificial Sink Sub".to_string(),
blocks: vec![Term {
tid: dummy_blk_tid,
term: Blk {
defs: Vec::new(),
jmps: Vec::new(),
indirect_jmp_targets: Vec::new(),
},
}],
},
};
self.program.term.subs.push(dummy_sub);
}
log_messages
}
/// Propagate input expressions along variable assignments.
///
/// For every block of the project, subsequent assignments to the same variable are merged first
/// and then the input expressions are propagated through the block.
/// The propagation only occurs inside basic blocks
/// but not across basic block boundaries.
fn propagate_input_expressions(&mut self) {
    let all_blocks = self
        .program
        .term
        .subs
        .iter_mut()
        .flat_map(|sub| sub.term.blocks.iter_mut());
    for block in all_blocks {
        block.merge_def_assignments_to_same_var();
        block.propagate_input_expressions();
    }
}
/// Run some normalization passes over the project.
///
/// The passes are run in the following order:
/// - Replace jumps to nonexisting TIDs with jumps to artificial sink targets in the CFG.
/// - Duplicate blocks so that if a block is contained in several functions, each function gets its own unique copy.
/// - Propagate input expressions along variable assignments.
/// - Replace trivial expressions like `a XOR a` with their result.
/// - Remove dead register assignments
///
/// The returned log messages are those generated by the first pass.
/// The other passes do not return log messages.
#[must_use]
pub fn normalize(&mut self) -> Vec<LogMessage> {
// Only this pass generates log messages.
let logs = self.remove_references_to_nonexisting_tids();
self.make_block_to_sub_mapping_unique();
self.propagate_input_expressions();
self.substitute_trivial_expressions();
crate::analysis::dead_variable_elimination::remove_dead_var_assignments(self);
logs
}
}
#[cfg(test)]
mod tests {
use crate::intermediate_representation::BinOpType;
use super::*;
impl Blk {
    /// Mock an empty block (no `Def`s, no `Jmp`s, no indirect jump targets) with the TID `block`.
    pub fn mock() -> Term<Blk> {
        Self::mock_with_tid("block")
    }

    /// Mock an empty block (no `Def`s, no `Jmp`s, no indirect jump targets) with the given TID.
    pub fn mock_with_tid(tid: &str) -> Term<Blk> {
        let empty_block = Blk {
            defs: vec![],
            jmps: vec![],
            indirect_jmp_targets: vec![],
        };
        Term {
            tid: Tid::new(tid),
            term: empty_block,
        }
    }
}
impl Sub {
    /// Mock a subroutine without blocks.
    /// The given name is used both as the name and as the TID of the subroutine.
    pub fn mock(name: impl ToString) -> Term<Sub> {
        let name = name.to_string();
        let tid = Tid::new(name.clone());
        let term = Sub {
            name,
            blocks: Vec::new(),
        };
        Term { tid, term }
    }
}
impl Program {
    /// Mock a program without subroutines, extern symbols and entry points
    /// and with an address base offset of zero.
    pub fn mock_empty() -> Program {
        Program {
            address_base_offset: 0,
            subs: vec![],
            extern_symbols: vec![],
            entry_points: vec![],
        }
    }
}
impl CallingConvention {
    /// Mock a calling convention with `RDI` as integer parameter register
    /// and `XMMO` as float parameter register.
    ///
    /// NOTE(review): `XMMO` is spelled with the letter `O`; presumably `XMM0` was intended.
    /// Confirm that no test depends on the current spelling before changing it.
    pub fn mock() -> CallingConvention {
        Self::mock_with_parameter_registers(vec!["RDI".to_string()], vec!["XMMO".to_string()])
    }

    /// Mock a calling convention with the given parameter registers,
    /// `RAX` as return register and `RBP` as callee-saved register.
    pub fn mock_with_parameter_registers(
        integer_parameter_register: Vec<String>,
        float_parameter_register: Vec<String>,
    ) -> CallingConvention {
        // The name is chosen so that the mock is useable as standard calling convention in tests.
        let name = String::from("__stdcall");
        CallingConvention {
            name,
            integer_parameter_register,
            float_parameter_register,
            return_register: vec![String::from("RAX")],
            callee_saved_register: vec![String::from("RBP")],
        }
    }
}
impl Arg {
    /// Mock a register argument without data type indicator
    /// for a register with the given name and size.
    pub fn mock_register(name: impl ToString, size_in_bytes: impl Into<ByteSize>) -> Arg {
        let var = Variable::mock(name.to_string(), size_in_bytes);
        Arg::Register {
            var,
            data_type: None,
        }
    }
}
impl ExternSymbol {
    /// Mock an extern symbol named `mock_symbol` that uses the `__stdcall` calling convention,
    /// takes one parameter in `RDI` and returns one value in `RAX`.
    pub fn mock() -> ExternSymbol {
        let symbol_name = "mock_symbol";
        let parameter = Arg::mock_register("RDI", 8);
        let return_value = Arg::mock_register("RAX", 8);
        ExternSymbol {
            tid: Tid::new(symbol_name),
            addresses: vec![String::from("UNKNOWN")],
            name: String::from(symbol_name),
            calling_convention: Some(String::from("__stdcall")),
            parameters: vec![parameter],
            return_values: vec![return_value],
            no_return: false,
            has_var_args: false,
        }
    }
}
impl DatatypeProperties {
    /// Mock the sizes of the C data types. The fields are listed in order of ascending size.
    pub fn mock() -> DatatypeProperties {
        DatatypeProperties {
            // 1 byte
            char_size: ByteSize::new(1),
            // 2 bytes
            short_size: ByteSize::new(2),
            // 4 bytes
            float_size: ByteSize::new(4),
            integer_size: ByteSize::new(4),
            long_size: ByteSize::new(4),
            // 8 bytes
            double_size: ByteSize::new(8),
            long_double_size: ByteSize::new(8),
            long_long_size: ByteSize::new(8),
            pointer_size: ByteSize::new(8),
        }
    }
}
impl Project {
    /// Mock a project for the `x86_64` architecture with an empty program and no calling conventions.
    ///
    /// The register list contains eight registers of size 8 bytes
    /// and `RSP` is used as the stack pointer register.
    pub fn mock_empty() -> Project {
        const REGISTER_NAMES: [&str; 8] = ["RAX", "RCX", "RDX", "RBX", "RSP", "RBP", "RSI", "RDI"];
        let program = Term {
            tid: Tid::new("program_tid"),
            term: Program::mock_empty(),
        };
        Project {
            program,
            cpu_architecture: String::from("x86_64"),
            stack_pointer_register: Variable::mock("RSP", 8u64),
            calling_conventions: Vec::new(),
            register_list: REGISTER_NAMES
                .iter()
                .map(|name| Variable::mock(*name, ByteSize::new(8)))
                .collect(),
            datatype_properties: DatatypeProperties::mock(),
        }
    }
}
/// A direct branch to an unknown TID has to be redirected to the dummy block TID
/// and the redirection has to be reported as an error.
#[test]
fn retarget_nonexisting_jumps() {
    let original_target = Tid::new("nonexisting_target");
    let mut jmp_term = Term {
        tid: Tid::new("jmp"),
        term: Jmp::Branch(original_target.clone()),
    };
    assert_eq!(jmp_term.term, Jmp::Branch(original_target));

    // No TID is known, so the target of the branch cannot exist.
    let known_tids = HashSet::new();
    let result = jmp_term.retarget_nonexisting_jump_targets_to_dummy_tid(
        &known_tids,
        &Tid::new("dummy_sub"),
        &Tid::new("dummy_blk"),
    );
    assert!(result.is_err());
    assert_eq!(jmp_term.term, Jmp::Branch(Tid::new("dummy_blk")));
}
/// `check_for_zero_extension` must only report assignments of the form `RAX = IntZExt(EAX)`.
#[test]
fn zero_extension_check() {
    // Build a non-temporary register variable.
    let register = |name: &str, size: u64| Variable {
        name: String::from(name),
        size: ByteSize::new(size),
        is_temp: false,
    };
    // Build the assignment `RAX = op(arg)` with an 8 byte cast result.
    let assign_cast_to_rax = |op: CastOpType, arg: Expression| Term {
        tid: Tid::new("zero_tid"),
        term: Def::Assign {
            var: register("RAX", 8),
            value: Expression::Cast {
                op,
                size: ByteSize::new(8),
                arg: Box::new(arg),
            },
        },
    };

    let eax_variable = Expression::Var(register("EAX", 4));
    let int_sub_expr = Expression::BinOp {
        op: BinOpType::IntSub,
        lhs: Box::new(Expression::Var(register("EAX", 4))),
        rhs: Box::new(Expression::Var(register("ECX", 4))),
    };

    let zero_extend_def = assign_cast_to_rax(CastOpType::IntZExt, eax_variable.clone());
    // An expression that is a zero extension but does not directly contain a variable
    let zero_extend_but_no_var_def = assign_cast_to_rax(CastOpType::IntZExt, int_sub_expr);
    let non_zero_extend_def = assign_cast_to_rax(CastOpType::IntSExt, eax_variable);

    let check = |def: &Term<Def>| {
        def.check_for_zero_extension(String::from("RAX"), String::from("EAX"))
    };
    assert_eq!(check(&zero_extend_def), Some(Tid::new("zero_tid")));
    assert_eq!(check(&zero_extend_but_no_var_def), None);
    assert_eq!(check(&non_zero_extend_def), None);
}
/// Merging of assignments followed by expression propagation on a block with five assignments
/// must merge `tid_4` into `tid_5` and substitute the known definitions into `tid_2` and `tid_5`.
#[test]
fn expression_propagation() {
    use crate::intermediate_representation::UnOpType;
    // Shorthands for the expressions and assignments used in this test.
    let var = |name: &str| Expression::var(name, 8);
    let negate = |expr: Expression| expr.un_op(UnOpType::IntNegate);
    let assign = |tid: &str, target: &str, value: Expression| {
        Def::assign(tid, Variable::mock(target, 8), value)
    };

    let defs = vec![
        assign("tid_1", "X", negate(var("Y"))),
        assign("tid_2", "Y", var("X").plus(var("Y"))),
        assign("tid_3", "X", negate(var("X"))),
        assign("tid_4", "Y", negate(var("Y"))),
        assign("tid_5", "Y", var("X").plus(var("Y"))),
    ];
    let mut block = Term {
        tid: Tid::new("block"),
        term: Blk {
            defs,
            jmps: Vec::new(),
            indirect_jmp_targets: Vec::new(),
        },
    };

    block.merge_def_assignments_to_same_var();
    block.propagate_input_expressions();

    let result_defs = vec![
        assign("tid_1", "X", negate(var("Y"))),
        assign("tid_2", "Y", negate(var("Y")).plus(var("Y"))),
        assign("tid_3", "X", negate(var("X"))),
        assign("tid_5", "Y", var("X").plus(negate(var("Y")))),
    ];
    assert_eq!(block.term.defs, result_defs);
}
/// Create a block without `Def`s whose only jump is a direct branch to the block named `target_name`.
/// The TID of the jump is `jmp_<block_name>`.
fn create_block_with_jump_target(block_name: &str, target_name: &str) -> Term<Blk> {
    let jump = Term {
        tid: Tid::new(format!("jmp_{}", block_name)),
        term: Jmp::Branch(Tid::new(target_name)),
    };
    let block = Blk {
        defs: Vec::new(),
        jmps: vec![jump],
        indirect_jmp_targets: Vec::new(),
    };
    Term {
        tid: Tid::new(block_name),
        term: block,
    }
}
/// Create a subroutine containing the given blocks.
/// `sub_name` is used both as the name and as the TID of the subroutine.
fn create_sub_with_blocks(sub_name: &str, blocks: Vec<Term<Blk>>) -> Term<Sub> {
    let sub = Sub {
        name: String::from(sub_name),
        blocks,
    };
    Term {
        tid: Tid::new(sub_name),
        term: sub,
    }
}
/// Blocks reachable from several subs must be duplicated for each sub other than their original one
/// and the jump targets must be adjusted to the sub-specific duplicates.
#[test]
fn duplication_of_blocks_contained_in_several_subs() {
    let sub_1 = create_sub_with_blocks(
        "sub_1",
        vec![
            create_block_with_jump_target("blk_1", "blk_2"),
            create_block_with_jump_target("blk_2", "blk_1"),
        ],
    );
    let sub_2 = create_sub_with_blocks(
        "sub_2",
        vec![create_block_with_jump_target("blk_3", "blk_2")],
    );
    let sub_3 = create_sub_with_blocks(
        "sub_3",
        vec![create_block_with_jump_target("blk_4", "blk_3")],
    );
    let mut project = Project::mock_empty();
    project.program.term.subs = vec![sub_1.clone(), sub_2, sub_3];

    project.make_block_to_sub_mapping_unique();
    let subs = &project.program.term.subs;

    // The first sub only contains its own blocks and thus stays unchanged.
    assert_eq!(&subs[0], &sub_1);

    let sub_2_modified = create_sub_with_blocks(
        "sub_2",
        vec![
            create_block_with_jump_target("blk_3", "blk_2_sub_2"),
            create_block_with_jump_target("blk_2_sub_2", "blk_1_sub_2"),
            create_block_with_jump_target("blk_1_sub_2", "blk_2_sub_2"),
        ],
    );
    let sub_2_blocks = &subs[1].term.blocks;
    assert_eq!(sub_2_blocks.len(), 3);
    assert_eq!(&sub_2_blocks[0], &sub_2_modified.term.blocks[0]);
    // The order of the appended duplicates is not fixed, so only containment is checked for them.
    for expected_block in &sub_2_modified.term.blocks[1..] {
        assert!(sub_2_blocks.contains(expected_block));
    }

    let sub_3_modified = create_sub_with_blocks(
        "sub_3",
        vec![
            create_block_with_jump_target("blk_4", "blk_3_sub_3"),
            create_block_with_jump_target("blk_3_sub_3", "blk_2_sub_3"),
            create_block_with_jump_target("blk_2_sub_3", "blk_1_sub_3"),
            create_block_with_jump_target("blk_1_sub_3", "blk_2_sub_3"),
        ],
    );
    let sub_3_blocks = &subs[2].term.blocks;
    assert_eq!(sub_3_blocks.len(), 4);
    assert_eq!(&sub_3_blocks[0], &sub_3_modified.term.blocks[0]);
    for expected_block in &sub_3_modified.term.blocks {
        assert!(sub_3_blocks.contains(expected_block));
    }
}
}
......@@ -2,10 +2,10 @@
//! for different terms.
#[cfg(test)]
use crate::intermediate_representation::{Expression, Variable};
use crate::intermediate_representation::{Def, Expression, Jmp, Variable};
#[cfg(test)]
use super::{Def, Jmp, Term, Tid};
use super::{Term, Tid};
/// ## Helper functions for building defs
#[cfg(test)]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment