Generate call trace information for CWE-119 check (#365)

1ebd9bcb · Enkelmann · GitHub · 9ad700d7 · 1ebd9bcb · 1ebd9bcb
Unverified Commit 1ebd9bcb authored Dec 14, 2022 by Enkelmann Committed by GitHub Dec 14, 2022
4 changed files
--- a/src/cwe_checker_lib/src/analysis/callgraph.rs
+++ b/src/cwe_checker_lib/src/analysis/callgraph.rs
 //! Generate call graphs out of a program term.

-use std::collections::HashMap;
-
 use crate::intermediate_representation::*;
-use petgraph::graph::DiGraph;
+use petgraph::{graph::DiGraph, graph::NodeIndex, visit::EdgeRef};
+use std::collections::{BTreeSet, HashMap};

 /// The graph type of a call graph: nodes are function TIDs, each edge holds a reference to the corresponding `Term<Jmp>`.
 pub type CallGraph<'a> = DiGraph<Tid, &'a Term<Jmp>>;
@@ -40,26 +39,105 @@ pub fn get_program_callgraph(program: &Term<Program>) -> CallGraph {
    callgraph
 }

+/// Collect and return all call TIDs of call sequences that start in the function given by the `source_sub_tid`
+/// and end in the function given by the `target_sub_tid`. The set is empty if no such sequence exists. Panics if `source_sub_tid` is not a node of the call graph.
+pub fn find_call_sequences_to_target(
+    callgraph: &CallGraph,
+    source_sub_tid: &Tid,
+    target_sub_tid: &Tid,
+) -> BTreeSet<Tid> {
+    let source_node = callgraph
+        .node_indices()
+        .find(|node| callgraph[*node] == *source_sub_tid)
+        .unwrap_or_else(|| panic!("Function TID not found in call graph."));
+    find_call_sequences_from_node_to_target(callgraph, source_node, target_sub_tid, BTreeSet::new())
+}
+
+/// Recursively collects all call TIDs of call sequences that start in the function given by the `source_node` in the call graph
+/// and end in the function given by the `target_sub_tid`. Nodes contained in `visited_nodes` are not entered again on the current path.
+fn find_call_sequences_from_node_to_target(
+    callgraph: &CallGraph,
+    source_node: NodeIndex,
+    target_sub_tid: &Tid,
+    visited_nodes: BTreeSet<NodeIndex>,
+) -> BTreeSet<Tid> {
+    let mut call_tids = BTreeSet::new();
+    for edge in callgraph.edges_directed(source_node, petgraph::Direction::Outgoing) {
+        let (_, target_node) = callgraph.edge_endpoints(edge.id()).unwrap();
+        if callgraph[target_node] == *target_sub_tid { // Direct call to the target function.
+            call_tids.insert(edge.weight().tid.clone());
+        } else if !visited_nodes.contains(&target_node) {
+            let mut recursive_visited = visited_nodes.clone(); // Each explored path gets its own copy of the visited set.
+            recursive_visited.insert(target_node);
+            let recursive_tids = find_call_sequences_from_node_to_target(
+                callgraph,
+                target_node,
+                target_sub_tid,
+                recursive_visited,
+            );
+            if !recursive_tids.is_empty() { // Keep this call only if the target is reachable through it.
+                call_tids.extend(recursive_tids.into_iter());
+                call_tids.insert(edge.weight().tid.clone());
+            }
+        }
+    }
+    call_tids
+}
+
 #[cfg(test)]
 pub mod tests {
    use super::*;
+    use std::collections::BTreeMap;
+
+    /// Mock a function with calls to the given list of Sub-TIDs. Each call is placed in its own block without a return target.
+    /// Each call gets a unique ID of the form `<sub_tid>_call_<target>_<index>`, so that the edges in the call graph will be distinguishable.
+    fn mock_sub_with_calls(sub_tid: &str, call_targets: &[&str]) -> Term<Sub> {
+        let mut sub = Sub::mock(sub_tid);
+        for (i, target) in call_targets.iter().enumerate() {
+            let call = Jmp::Call {
+                target: Tid::new(target),
+                return_: None,
+            };
+            let mut block = Blk::mock();
+            block.term.jmps.push(Term {
+                tid: Tid::new(format!("{}_call_{}_{}", sub_tid, target, i)),
+                term: call,
+            });
+            sub.term.blocks.push(block);
+        }
+        sub
+    }
+
+    #[test]
+    fn test_find_call_sequences_to_target() {
+        let mut project = Project::mock_x64();
+        let sub1 = mock_sub_with_calls("sub1", &["sub2", "sub2"]);
+        let sub2 = mock_sub_with_calls("sub2", &["sub3", "sub4"]);
+        let sub3 = mock_sub_with_calls("sub3", &[]);
+        let sub4 = mock_sub_with_calls("sub4", &[]);
+        project.program.term.subs = BTreeMap::from([
+            (Tid::new("sub1"), sub1),
+            (Tid::new("sub2"), sub2),
+            (Tid::new("sub3"), sub3),
+            (Tid::new("sub4"), sub4),
+        ]);
+        let callgraph = get_program_callgraph(&project.program);
+        let call_tids =
+            find_call_sequences_to_target(&callgraph, &Tid::new("sub1"), &Tid::new("sub3"));
+        let call_tids: Vec<_> = call_tids.iter().map(|tid| format!("{}", tid)).collect();
+        assert_eq!(call_tids.len(), 3); // Both calls to sub2 plus the call to sub3; the call to sub4 is not part of the result.
+        // Note that the order of elements is important in the sense that it needs to be deterministic (the TIDs are returned as a `BTreeSet`).
+        assert_eq!(&call_tids[0], "sub1_call_sub2_0");
+        assert_eq!(&call_tids[1], "sub1_call_sub2_1");
+        assert_eq!(&call_tids[2], "sub2_call_sub3_0");
+    }

    #[test]
    fn test_get_program_callgraph() {
        // Create a program with 2 functions and one call between them
        let mut project = Project::mock_x64();
-        let mut caller = Sub::mock("caller");
-        let callee = Sub::mock("callee");
-        let call = Jmp::Call {
-            target: Tid::new("callee"),
-            return_: None,
-        };
-        let mut call_block = Blk::mock();
-        call_block.term.jmps.push(Term {
-            tid: Tid::new("call"),
-            term: call,
-        });
-        caller.term.blocks.push(call_block);
+        let caller = mock_sub_with_calls("caller", &["callee"]);
+        let callee = mock_sub_with_calls("callee", &[]);
        project.program.term.subs.insert(Tid::new("caller"), caller);
        project.program.term.subs.insert(Tid::new("callee"), callee);
        // Test correctness of the call graph

--- a/src/cwe_checker_lib/src/checkers/cwe_119/context/mod.rs
+++ b/src/cwe_checker_lib/src/checkers/cwe_119/context/mod.rs
 use crate::abstract_domain::*;
+use crate::analysis::callgraph::CallGraph;
 use crate::analysis::function_signature::FunctionSignature;
 use crate::analysis::graph::Graph;
 use crate::analysis::pointer_inference::{Data, PointerInference};
@@ -38,6 +39,8 @@ pub struct Context<'a> {
    pub malloc_tid_to_object_size_map: HashMap<Tid, Data>,
    /// A map that maps the TIDs of jump instructions to the function TID of the caller.
    pub call_to_caller_fn_map: HashMap<Tid, Tid>,
+    /// The callgraph corresponding to the project.
+    pub callgraph: CallGraph<'a>,
    /// A sender channel that can be used to collect logs in the corresponding logging thread.
    pub log_collector: crossbeam_channel::Sender<LogThreadMsg>,
 }
@@ -52,6 +55,7 @@ impl<'a> Context<'a> {
        'a: 'b,
    {
        let project = analysis_results.project;
+        let callgraph = crate::analysis::callgraph::get_program_callgraph(&project.program);
        Context {
            project,
            graph: analysis_results.control_flow_graph,
@@ -63,6 +67,7 @@ impl<'a> Context<'a> {
            ),
            malloc_tid_to_object_size_map: compute_size_values_of_malloc_calls(analysis_results),
            call_to_caller_fn_map: compute_call_to_caller_map(project),
+            callgraph,
            log_collector,
        }
    }

--- a/src/cwe_checker_lib/src/checkers/cwe_119/state.rs
+++ b/src/cwe_checker_lib/src/checkers/cwe_119/state.rs
@@ -87,6 +87,18 @@ impl State {
                        ) = context.compute_bounds_of_id(id, &self.stack_id)
                        {
                            out_of_bounds_access_warnings.push(format!("The object bound is based on the possible source value {:#} for the object ID.", source.to_json_compact()));
+                            let call_sequence_tids = collect_tids_for_cwe_warning(
+                                source.get_if_unique_target().unwrap().0,
+                                self,
+                                context,
+                            );
+                            out_of_bounds_access_warnings
+                                .push(format!("Relevant callgraph TIDs: [{}]", call_sequence_tids));
+                        } else {
+                            out_of_bounds_access_warnings.push(format!(
+                                "Relevant callgraph TIDs: [{}]",
+                                self.stack_id.get_tid()
+                            ));
                        }
                        // Replace the bound with `Top` to prevent duplicate CWE warnings with the same root cause.
                        self.object_lower_bounds
@@ -110,6 +122,18 @@ impl State {
                        ) = context.compute_bounds_of_id(id, &self.stack_id)
                        {
                            out_of_bounds_access_warnings.push(format!("The object bound is based on the possible source value {:#} for the object ID.", source.to_json_compact()));
+                            let call_sequence_tids = collect_tids_for_cwe_warning(
+                                source.get_if_unique_target().unwrap().0,
+                                self,
+                                context,
+                            );
+                            out_of_bounds_access_warnings
+                                .push(format!("Relevant callgraph TIDs: [{}]", call_sequence_tids));
+                        } else {
+                            out_of_bounds_access_warnings.push(format!(
+                                "Relevant callgraph TIDs: [{}]",
+                                self.stack_id.get_tid()
+                            ));
                        }
                        // Replace the bound with `Top` to prevent duplicate CWE warnings with the same root cause.
                        self.object_upper_bounds
@@ -194,6 +218,58 @@ impl State {
    }
 }

+/// Collect all relevant call sequence TIDs corresponding to a CWE warning.
+/// This includes:
+/// - The TID of a root function from which both the allocation site and the site of the CWE warning can be reached.
+/// - All call TIDs that are relevant for reaching the allocation site from the root function.
+/// - All call TIDs that are relevant for reaching the site of the CWE warning.
+///   This list is complete in the sense that all possible paths in the call graph from the root function to the CWE warning site
+///   are covered by these calls.
+///
+/// The resulting list is returned as a comma-separated string,
+/// as it is currently only used for human-readable context information in the CWE warnings.
+fn collect_tids_for_cwe_warning(
+    id: &AbstractIdentifier,
+    state: &State,
+    context: &Context,
+) -> String {
+    use crate::analysis::callgraph::find_call_sequences_to_target;
+    let caller_tid = if context.project.program.term.subs.contains_key(id.get_tid()) {
+        // The ID is the stack ID of some function.
+        id.get_tid().clone()
+    } else {
+        // Otherwise the ID is assumed to correspond to a malloc-like call.
+        let root_call_tid = if let Some(root_call) = id.get_path_hints().last() {
+            root_call
+        } else {
+            id.get_tid()
+        };
+        context
+            .project
+            .program
+            .term
+            .find_sub_containing_jump(root_call_tid)
+            .expect("Caller corresponding to call does not exist.")
+    };
+    let mut tids = Vec::new();
+    tids.push(caller_tid.clone());
+    tids.extend(id.get_path_hints().iter().cloned());
+    if caller_tid != *state.stack_id.get_tid() {
+        // We also need the possible call sequences from the caller to the current function
+        let call_sequence_tids = find_call_sequences_to_target(
+            &context.callgraph,
+            &caller_tid,
+            state.stack_id.get_tid(),
+        );
+        tids.extend(call_sequence_tids.into_iter());
+    }
+    // Build a string out of the TID list, joining the elements with ", "
+    tids.iter()
+        .map(|tid| format!("{}", tid))
+        .reduce(|accum, elem| format!("{}, {}", accum, elem))
+        .unwrap() // `tids` always contains at least `caller_tid`, so the reduction cannot yield `None`.
+}
+
 #[cfg(test)]
 pub mod tests {
    use super::*;
@@ -242,7 +318,7 @@ pub mod tests {
            state
                .check_address_access(&address, ByteSize::new(8), &context)
                .len(),
-            1
+            2
        );
        // subsequent errors are suppressed
        let address = Data::from_target(stack_id, Bitvector::from_i64(8).into());

--- a/src/cwe_checker_lib/src/intermediate_representation/program.rs
+++ b/src/cwe_checker_lib/src/intermediate_representation/program.rs
@@ -32,6 +32,22 @@ impl Program {
            .flat_map(|(_, sub)| sub.term.blocks.iter())
            .find(|block| block.tid == *tid)
    }
+
+    /// Find the sub containing a specific jump instruction (including call instructions) and return its TID, or `None` if no sub contains it.
+    /// WARNING: The function simply iterates through all blocks,
+    /// i.e. it is very inefficient for large projects!
+    pub fn find_sub_containing_jump(&self, jmp_tid: &Tid) -> Option<Tid> {
+        for sub in self.subs.values() {
+            for blk in &sub.term.blocks {
+                for jmp in &blk.term.jmps {
+                    if &jmp.tid == jmp_tid {
+                        return Some(sub.tid.clone());
+                    }
+                }
+            }
+        }
+        None
+    }
 }

 #[cfg(test)]