Commit 5343432e by Enkelmann Committed by Enkelmann

add documentation to the internal IR

parent 676e27a0
...@@ -20,6 +20,7 @@ use petgraph::graph::NodeIndex; ...@@ -20,6 +20,7 @@ use petgraph::graph::NodeIndex;
use petgraph::visit::IntoNodeReferences; use petgraph::visit::IntoNodeReferences;
use petgraph::Direction; use petgraph::Direction;
use std::collections::HashMap; use std::collections::HashMap;
use crate::prelude::*;
mod context; mod context;
mod object; mod object;
......
...@@ -2,28 +2,55 @@ use super::ByteSize; ...@@ -2,28 +2,55 @@ use super::ByteSize;
use super::Variable; use super::Variable;
use crate::prelude::*; use crate::prelude::*;
/// An expression is a calculation rule
/// on how to compute a certain value given some variables (register values) as input.
///
/// The basic building blocks of expressions are the same as for Ghidra P-Code.
/// However, expressions can be nested, unlike original P-Code.
///
/// Computing the value of an expression is a side-effect-free operation.
///
/// Expressions are typed in the sense that each expression has a `ByteSize`
/// indicating the size of the result when evaluating the expression.
/// Some expressions impose restrictions on the sizes of their inputs
/// for the expression to be well-typed.
///
/// All operations are defined the same as the corresponding P-Code operation.
/// Further information about specific operations can be obtained by looking up the P-Code mnemonics in the
/// [P-Code Reference Manual](https://ghidra.re/courses/languages/html/pcoderef.html).
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)] #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub enum Expression { pub enum Expression {
/// A variable representing a register or temporary value of known size.
Var(Variable), Var(Variable),
/// A constant value represented by a bitvector.
Const(Bitvector), Const(Bitvector),
/// A binary operation.
/// Note that most (but not all) operations require the left hand side (`lhs`)
/// and right hand side (`rhs`) to be of equal size.
BinOp { BinOp {
op: BinOpType, op: BinOpType,
lhs: Box<Expression>, lhs: Box<Expression>,
rhs: Box<Expression>, rhs: Box<Expression>,
}, },
/// A unary operation
UnOp { UnOp {
op: UnOpType, op: UnOpType,
arg: Box<Expression>, arg: Box<Expression>,
}, },
/// A cast operation for type casts between integer and floating point types of different byte lengths.
Cast { Cast {
op: CastOpType, op: CastOpType,
size: ByteSize, size: ByteSize,
arg: Box<Expression>, arg: Box<Expression>,
}, },
/// An unknown value but with known size.
/// This may be generated for e.g. unsupported assembly instructions.
/// Note that computation of an unknown value is still required to be side-effect-free!
Unknown { Unknown {
description: String, description: String,
size: ByteSize, size: ByteSize,
}, },
/// Extracting a sub-bitvector from the argument expression.
Subpiece { Subpiece {
low_byte: ByteSize, low_byte: ByteSize,
size: ByteSize, size: ByteSize,
...@@ -31,6 +58,7 @@ pub enum Expression { ...@@ -31,6 +58,7 @@ pub enum Expression {
}, },
} }
/// The type/mnemonic of a binary operation
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone, Copy)] #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum BinOpType { pub enum BinOpType {
Piece, Piece,
...@@ -69,6 +97,7 @@ pub enum BinOpType { ...@@ -69,6 +97,7 @@ pub enum BinOpType {
FloatDiv, FloatDiv,
} }
/// The type/mnemonic of a typecast
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone, Copy)] #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum CastOpType { pub enum CastOpType {
IntZExt, IntZExt,
...@@ -78,6 +107,7 @@ pub enum CastOpType { ...@@ -78,6 +107,7 @@ pub enum CastOpType {
Trunc, Trunc,
} }
/// The type/mnemonic of a unary operation
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone, Copy)] #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum UnOpType { pub enum UnOpType {
IntNegate, IntNegate,
......
//! This module defines the intermediate representation used to represent a binary
//! and all its contained executable code.
//!
//! The main data structure is the `Project` struct,
//! which contains all information recovered about a binary during the disassembly step.
//! To learn how individual instructions are encoded,
//! you should first take a look at the `Expression` type and then at the `Def` and `Jmp` data types,
//! which form the basis of the basic block `Blk` struct.
use crate::prelude::*; use crate::prelude::*;
use derive_more::*; use derive_more::*;
use std::convert::TryFrom; use std::convert::TryFrom;
...@@ -9,7 +18,11 @@ pub use expression::*; ...@@ -9,7 +18,11 @@ pub use expression::*;
mod term; mod term;
pub use term::*; pub use term::*;
// TODO: move ByteSize and BitSize into their own module /// An unsigned number of bytes.
///
/// Used to represent sizes of values in registers or in memory.
/// Can also be used for other byte-valued numbers, like offsets,
/// as long as the number is guaranteed to be non-negative.
#[derive( #[derive(
Serialize, Serialize,
Deserialize, Deserialize,
......
use super::{ByteSize, Expression, Variable}; use super::{ByteSize, Expression, Variable};
use crate::prelude::*; use crate::prelude::*;
use crate::term::{Term, Tid};
/// A term identifier consisting of an ID string (which is required to be unique)
/// and an address to indicate where the term is located.
///
/// The derived `PartialOrd`/`Ord` implementations compare the fields in declaration order,
/// i.e. the `id` is compared first and the `address` second.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone, PartialOrd, Ord)]
pub struct Tid {
/// The unique ID of the term.
/// This field is private; the `Display` implementation of `Tid` prints exactly this string.
id: String,
/// The address where the term is located.
/// `Tid::new` initializes it with the placeholder string `"UNKNOWN"`.
pub address: String,
}
impl Tid {
    /// Create a term identifier whose ID is the string representation of `val`.
    ///
    /// The address of the returned `Tid` is set to the placeholder `"UNKNOWN"`.
    pub fn new<T: ToString>(val: T) -> Tid {
        let id = val.to_string();
        let address = String::from("UNKNOWN");
        Tid { id, address }
    }

    /// Consume the `Tid` and return it with `suffix` appended to its ID string.
    ///
    /// The address is carried over unchanged.
    pub fn with_id_suffix(mut self, suffix: &str) -> Self {
        self.id.push_str(suffix);
        self
    }
}
impl std::fmt::Display for Tid {
    /// Print only the ID string of the term identifier; the address is not part of the output.
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        out.write_str(&self.id)
    }
}
/// A term is an object inside a binary with an address and a unique ID (both contained in the `tid`).
///
/// The struct is generic over the type `T` of the wrapped object.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Term<T> {
/// The term identifier, which also contains the address of the term
pub tid: Tid,
/// The object
pub term: T,
}
/// A side-effectful operation.
/// Can be a register assignment or a memory load/store operation.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)] #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub enum Def { pub enum Def {
/// A memory load into the register given by `var`.
///
/// The size of `var` also determines the number of bytes read from memory.
/// The size of `address` is required to match the pointer size of the corresponding CPU architecture.
Load { Load {
var: Variable, var: Variable,
address: Expression, address: Expression,
}, },
/// A memory store operation.
///
/// The size of `value` determines the number of bytes written.
/// The size of `address` is required to match the pointer size of the corresponding CPU architecture.
Store { Store {
address: Expression, address: Expression,
value: Expression, value: Expression,
}, },
/// A register assignment, assigning the result of the expression `value` to the register `var`.
Assign { Assign {
var: Variable, var: Variable,
value: Expression, value: Expression,
}, },
} }
/// A `Jmp` instruction affects the control flow of a program, i.e. it may change the instruction pointer.
/// With the exception of `CallOther`, it has no other side effects.
///
/// `Jmp` instructions carry some semantic information with them, like whether a jump is intra- or interprocedural.
/// Note that this semantic information may not always be correct.
///
/// The targets (and return targets) of jumps are, if known, either basic blocks (`Blk`) or subroutines (`Sub`)
/// depending on the type of the jump.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)] #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub enum Jmp { pub enum Jmp {
/// A direct intraprocedural jump to the targeted `Blk` term identifier.
Branch(Tid), Branch(Tid),
/// An indirect intraprocedural jump to the address that the given expression evaluates to.
BranchInd(Expression), BranchInd(Expression),
/// A direct intraprocedural jump that is only taken if the condition evaluates to true (i.e. not zero).
CBranch { CBranch {
target: Tid, target: Tid,
condition: Expression, condition: Expression,
}, },
/// A direct interprocedural jump representing a subroutine call.
///
/// Note that this is syntactically equivalent to a `Jmp::Branch`.
/// If the `return_` is `None`, then the called function does not return to its caller.
Call { Call {
target: Tid, target: Tid,
return_: Option<Tid>, return_: Option<Tid>,
}, },
/// An indirect interprocedural jump to the address the `target` expression evaluates to
/// and representing a subroutine call.
///
/// Note that this is syntactically equivalent to a `Jmp::BranchInd`.
/// If the `return_` is `None`, then the called function is believed to not return to its caller.
CallInd { CallInd {
target: Expression, target: Expression,
return_: Option<Tid>, return_: Option<Tid>,
}, },
/// An indirect interprocedural jump indicating a return from a subroutine.
///
/// Note that this is syntactically equivalent to a `Jmp::BranchInd`.
Return(Expression), Return(Expression),
/// This instruction is used for all side effects that are not representable by other instructions
/// or not supported by the disassembler.
///
/// E.g. syscalls and other interrupts are mapped to `CallOther`.
/// Assembly instructions that the disassembler does not support are also mapped to `CallOther`.
/// One can use the `description` field to match for and handle known side effects (e.g. syscalls).
///
/// The `return_` field indicates the `Blk` term identifier
/// where the disassembler assumes that execution will continue after handling of the side effect.
CallOther { CallOther {
description: String, description: String,
return_: Option<Tid>, return_: Option<Tid>,
}, },
} }
/// A basic block is a sequence of `Def` instructions followed by up to two `Jmp` instructions.
///
/// The `Def` instructions represent side-effectful operations that are executed in order when the block is entered.
/// `Def` instructions do not affect the control flow of a program.
///
/// The `Jmp` instructions represent control flow affecting operations.
/// There can only be zero, one or two `Jmp`s:
/// - Zero `Jmp`s indicate that the next instruction to be executed could not be discerned.
/// This should only happen on disassembler errors or on dead ends in the control flow graph that were deliberately inserted by the user.
/// - If there is exactly one `Jmp`, it is required to be an unconditional jump.
/// - For two jumps, the first one has to be a conditional jump,
/// where the second unconditional jump is only taken if the condition of the first jump evaluates to false.
///
/// Basic blocks are *single entry, single exit*, i.e. a basic block is only entered at the beginning
/// and is only exited by the jump instructions at the end of the block.
/// If a new control flow edge is discovered that would jump to the middle of a basic block,
/// the block structure needs to be updated accordingly.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)] #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Blk { pub struct Blk {
pub defs: Vec<Term<Def>>, pub defs: Vec<Term<Def>>,
pub jmps: Vec<Term<Jmp>>, pub jmps: Vec<Term<Jmp>>,
} }
/// A `Sub` or subroutine represents a function with a given name and a list of basic blocks belonging to it.
///
/// Subroutines are *single-entry*,
/// i.e. calling a subroutine will execute the first block in the list of basic blocks.
/// A subroutine may have multiple exits, which are identified by `Jmp::Return` instructions.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)] #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Sub { pub struct Sub {
/// The name of the subroutine
pub name: String, pub name: String,
/// The basic blocks belonging to the subroutine.
/// The first block is also the entry point of the subroutine.
pub blocks: Vec<Term<Blk>>, pub blocks: Vec<Term<Blk>>,
} }
/// A parameter or return argument of a function.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)] #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub enum Arg { pub enum Arg {
/// The argument is passed in a register
Register(Variable), Register(Variable),
/// The argument is passed on the stack.
/// It is positioned at the given offset (in bytes) relative to the stack pointer on function entry
/// and has the given size.
Stack { offset: i64, size: ByteSize }, Stack { offset: i64, size: ByteSize },
} }
/// An extern symbol represents a function that is dynamically linked from another binary.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)] #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct ExternSymbol { pub struct ExternSymbol {
pub tid: Tid, pub tid: Tid,
/// The name of the extern symbol
pub name: String, pub name: String,
/// The calling convention used for the extern symbol if known
pub calling_convention: Option<String>, pub calling_convention: Option<String>,
/// Parameters of an extern symbol.
/// May be empty if there are no parameters or the parameters are unknown.
pub parameters: Vec<Arg>, pub parameters: Vec<Arg>,
/// Return values of an extern symbol.
/// May be empty if there is no return value or the return values are unknown.
pub return_values: Vec<Arg>, pub return_values: Vec<Arg>,
/// If set to `true`, the function is assumed to never return to its caller when called.
pub no_return: bool, pub no_return: bool,
} }
/// The `Program` structure represents a disassembled binary.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)] #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Program { pub struct Program {
/// The known functions contained in the binary
pub subs: Vec<Term<Sub>>, pub subs: Vec<Term<Sub>>,
/// Extern symbols linked to the binary by the linker.
pub extern_symbols: Vec<ExternSymbol>, pub extern_symbols: Vec<ExternSymbol>,
/// Entry points into the binary,
/// i.e. the term identifiers of functions that may be called from outside of the binary.
pub entry_points: Vec<Tid>, pub entry_points: Vec<Tid>,
} }
/// The `Project` struct is the main data structure representing a binary.
///
/// It contains information about the disassembled binary
/// and about the execution environment of the binary.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)] #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Project { pub struct Project {
/// All (known) executable code of the binary is contained in the `program` term.
pub program: Term<Program>, pub program: Term<Program>,
/// The CPU architecture on which the binary is assumed to be executed.
pub cpu_architecture: String, pub cpu_architecture: String,
/// The stack pointer register for the given CPU architecture.
pub stack_pointer_register: Variable, pub stack_pointer_register: Variable,
} }
use super::ByteSize; use super::ByteSize;
use crate::prelude::*; use crate::prelude::*;
/// A variable represents a register with a known size and name.
///
/// Variables can be temporary (or virtual).
/// In this case they do not represent actual physical registers
/// and are only used to store intermediate results necessary for representing more complex assembly instructions.
/// Temporary variables are only valid until the end of the current assembly instruction.
/// However, one assembly instruction may span more than one basic block in the intermediate representation
/// (but never more than one function).
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)] #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Variable { pub struct Variable {
pub name: String, pub name: String,
......
...@@ -21,6 +21,6 @@ mod prelude { ...@@ -21,6 +21,6 @@ mod prelude {
pub use serde::{Deserialize, Serialize}; pub use serde::{Deserialize, Serialize};
pub use crate::bil::{BitSize, Bitvector}; pub use crate::bil::{BitSize, Bitvector};
pub use crate::term::Tid; pub use crate::intermediate_representation::{Term, Tid};
pub use anyhow::{anyhow, Error}; pub use anyhow::{anyhow, Error};
} }
...@@ -10,7 +10,6 @@ use crate::intermediate_representation::Program as IrProgram; ...@@ -10,7 +10,6 @@ use crate::intermediate_representation::Program as IrProgram;
use crate::intermediate_representation::Project as IrProject; use crate::intermediate_representation::Project as IrProject;
use crate::intermediate_representation::Sub as IrSub; use crate::intermediate_representation::Sub as IrSub;
use crate::prelude::*; use crate::prelude::*;
use crate::term::{Term, Tid};
// TODO: Handle the case where an indirect tail call is represented by CALLIND plus RETURN // TODO: Handle the case where an indirect tail call is represented by CALLIND plus RETURN
......
...@@ -8,45 +8,11 @@ use crate::intermediate_representation::Program as IrProgram; ...@@ -8,45 +8,11 @@ use crate::intermediate_representation::Program as IrProgram;
use crate::intermediate_representation::Project as IrProject; use crate::intermediate_representation::Project as IrProject;
use crate::intermediate_representation::Sub as IrSub; use crate::intermediate_representation::Sub as IrSub;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::intermediate_representation::{Term, Tid};
pub mod symbol; pub mod symbol;
use symbol::ExternSymbol; use symbol::ExternSymbol;
/// A term identifier made up of an ID string and an address string.
///
/// The derived `PartialOrd`/`Ord` implementations compare the fields in declaration order,
/// i.e. the `id` is compared first and the `address` second.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone, PartialOrd, Ord)]
pub struct Tid {
/// The ID string of the term. This field is private.
id: String,
/// The address associated with the term.
/// `Tid::new` initializes it with the placeholder string `"UNKNOWN"`.
pub address: String,
}
impl Tid {
    /// Create a term identifier whose ID is the string representation of `val`.
    ///
    /// The address of the returned `Tid` is set to the placeholder `"UNKNOWN"`.
    pub fn new<T: ToString>(val: T) -> Tid {
        let id = val.to_string();
        let address = String::from("UNKNOWN");
        Tid { id, address }
    }

    /// Consume the `Tid` and return it with `suffix` appended to its ID string.
    ///
    /// The address is carried over unchanged.
    pub fn with_id_suffix(mut self, suffix: &str) -> Self {
        self.id.push_str(suffix);
        self
    }
}
impl std::fmt::Display for Tid {
    /// Print only the ID string of the term identifier; the address is not part of the output.
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        out.write_str(&self.id)
    }
}
/// A generic wrapper pairing an object of type `T` with its term identifier.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Term<T> {
/// The term identifier of the wrapped object.
pub tid: Tid,
/// The wrapped object itself.
pub term: T,
}
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)] #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash, Clone)]
pub struct Def { pub struct Def {
pub lhs: Variable, pub lhs: Variable,
...@@ -241,10 +207,7 @@ impl From<Blk> for IrBlk { ...@@ -241,10 +207,7 @@ impl From<Blk> for IrBlk {
} else { } else {
for (counter, ir_def) in ir_defs.into_iter().enumerate() { for (counter, ir_def) in ir_defs.into_iter().enumerate() {
ir_def_terms.push(Term { ir_def_terms.push(Term {
tid: Tid { tid: def_term.tid.clone().with_id_suffix(&format!("_{}", counter)),
id: format!("{}_{}", def_term.tid.id, counter),
address: def_term.tid.address.clone(),
},
term: ir_def, term: ir_def,
}); });
} }
...@@ -261,10 +224,7 @@ impl From<Blk> for IrBlk { ...@@ -261,10 +224,7 @@ impl From<Blk> for IrBlk {
} }
for (counter, ir_def) in ir_defs.into_iter().enumerate() { for (counter, ir_def) in ir_defs.into_iter().enumerate() {
ir_def_terms.push(Term { ir_def_terms.push(Term {
tid: Tid { tid: jmp_term.tid.clone().with_id_suffix(&format!("_{}", counter)),
id: format!("{}_{}", jmp_term.tid.id, counter),
address: jmp_term.tid.address.clone(),
},
term: ir_def, term: ir_def,
}); });
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment