src/filter.rs

//! Compile user-supplied regex rules and classify lines.
//!
//! A [`FilterSet`] combines three things:
//!
//! 1. Zero or more *include* patterns. If any are set, a line must match at
//!    least one of them to pass.
//! 2. Zero or more *exclude* patterns. Any match causes the line to be dropped.
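//! 3. An optional *extract* pattern with named capture groups. When present,
//!    lines that match it produce a [`MatchOutcome::Fields`] holding the named
//!    captures in order; lines that pass the include/exclude rules but do not
//!    match it are still emitted as [`MatchOutcome::Plain`].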

use std::collections::BTreeMap;

use anyhow::{anyhow, Result};
use regex::{Regex, RegexSet};

/// Result of applying a [`FilterSet`] to a single line.
///
/// Outcomes compare by value (`PartialEq`/`Eq`), so callers and tests can use
/// `==` / `assert_eq!` directly instead of destructuring each variant.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MatchOutcome {
    /// Line passed without structured extraction.
    Plain,
    /// Line matched the extract pattern. Each entry is a `(name, value)` pair
    /// for one named capture group, preserved in order.
    Fields(Vec<(String, String)>),
}

/// A compiled set of line-filtering rules.
///
/// Built with [`FilterSet::compile`] and evaluated against individual lines
/// with [`FilterSet::apply`].
#[derive(Debug)]
pub struct FilterSet {
    /// Include patterns; `None` when no include pattern was supplied.
    includes: Option<RegexSet>,
    /// Exclude patterns; `None` when no exclude pattern was supplied.
    excludes: Option<RegexSet>,
    /// Extract pattern, if one was supplied.
    extract: Option<Regex>,
    /// Names of the extract pattern's named capture groups, in the order the
    /// regex reports them; empty when there is no extract pattern.
    field_order: Vec<String>,
}

impl FilterSet {
    /// Compile a set of include, exclude, and extract patterns.
    ///
    /// Each string is compiled independently. The returned [`FilterSet`] can
    /// be applied to lines with [`FilterSet::apply`].
    pub fn compile(
        includes: &[String],
        excludes: &[String],
        extract: Option<&str>,
    ) -> Result<Self> {
        let includes = if includes.is_empty() {
            None
        } else {
            Some(RegexSet::new(includes).map_err(|e| anyhow!("invalid --match pattern: {e}"))?)
        };
        let excludes = if excludes.is_empty() {
            None
        } else {
            Some(RegexSet::new(excludes).map_err(|e| anyhow!("invalid --exclude pattern: {e}"))?)
        };

        let (extract, field_order) = match extract {
            Some(p) => {
                let re = Regex::new(p).map_err(|e| anyhow!("invalid --extract pattern: {e}"))?;
                let order: Vec<String> = re
                    .capture_names()
                    .flatten()
                    .map(|s| s.to_string())
                    .collect();
                if order.is_empty() {
                    return Err(anyhow!(
                        "--extract pattern must contain at least one named capture group"
                    ));
                }
                (Some(re), order)
            }
            None => (None, Vec::new()),
        };

        Ok(Self { includes, excludes, extract, field_order })
    }

    /// Apply the rules to a line. Returns `None` if the line is filtered out.
    pub fn apply(&self, line: &str) -> Option<MatchOutcome> {
        if let Some(excl) = &self.excludes {
            if excl.is_match(line) {
                return None;
            }
        }
        if let Some(incl) = &self.includes {
            if !incl.is_match(line) {
                return None;
            }
        }

        if let Some(re) = &self.extract {
            match re.captures(line) {
                Some(caps) => {
                    let mut fields = Vec::with_capacity(self.field_order.len());
                    for name in &self.field_order {
                        let val = caps.name(name).map(|m| m.as_str()).unwrap_or("").to_string();
                        fields.push((name.clone(), val));
                    }
                    Some(MatchOutcome::Fields(fields))
                }
                None => {
                    // Still emit the line in plain form; the renderer handles
                    // the mixed stream shape.
                    Some(MatchOutcome::Plain)
                }
            }
        } else {
            Some(MatchOutcome::Plain)
        }
    }

    /// Field names in the order they appear in the extract regex. Used by the
    /// renderer to align tabular output.
    pub fn field_order(&self) -> &[String] {
        &self.field_order
    }

    /// Returns a rough cost estimate for the filter, for test snapshots.
    #[doc(hidden)]
    pub fn cost_estimate(&self) -> BTreeMap<&'static str, usize> {
        let mut m = BTreeMap::new();
        m.insert("includes", self.includes.as_ref().map(|r| r.len()).unwrap_or(0));
        m.insert("excludes", self.excludes.as_ref().map(|r| r.len()).unwrap_or(0));
        m.insert("fields", self.field_order.len());
        m
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A line passes only when it matches an include pattern and no exclude
    /// pattern.
    #[test]
    fn include_then_exclude() {
        let includes = vec![String::from("ERROR")];
        let excludes = vec![String::from("healthz")];
        let filter = FilterSet::compile(&includes, &excludes, None).unwrap();

        assert!(matches!(filter.apply("ERROR: disk full"), Some(MatchOutcome::Plain)));
        assert!(filter.apply("INFO: fine").is_none());
        assert!(filter.apply("ERROR: healthz failed").is_none());
    }

    /// Named captures come back as `(name, value)` pairs in regex order.
    #[test]
    fn extract_named_captures_in_order() {
        let pattern = r"rid=(?P<rid>\w+) ms=(?P<ms>\d+)";
        let filter = FilterSet::compile(&[], &[], Some(pattern)).unwrap();

        let outcome = filter.apply("t=5 rid=abc123 ms=47 path=/x").unwrap();
        let fields = match outcome {
            MatchOutcome::Fields(fields) => fields,
            other => panic!("unexpected: {other:?}"),
        };

        let expected: Vec<(String, String)> = vec![
            ("rid".into(), "abc123".into()),
            ("ms".into(), "47".into()),
        ];
        assert_eq!(fields, expected);
    }

    /// An extract pattern without any named group is rejected at compile time.
    #[test]
    fn extract_must_have_named_groups() {
        let message = FilterSet::compile(&[], &[], Some(r"\d+"))
            .unwrap_err()
            .to_string();
        assert!(message.contains("named capture"));
    }
}