// src/filter/dsl/parser/lexer.rs

//! Tokenizer for the filter DSL.
//!
//! Hand-written rather than generated because the grammar is tiny and a clean
//! manual implementation is easier to debug than a macro expansion.

use crate::filter::dsl::error::{DslError, DslResult};

/// A single lexical token.
///
/// Payload-carrying variants hold the already-decoded text or value; the
/// remaining variants are fixed operators, punctuation, and keywords.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// Bare identifier, e.g. `status`, `body`, `host`.
    Ident(String),
    /// String literal, stored without its surrounding quotes.
    ///
    /// NOTE(review): an earlier version of this comment also mentioned
    /// unquoted strings. The tokenizer in this module emits bare words as
    /// [`Token::Ident`], so any unquoted-to-`Str` conversion would have to
    /// happen elsewhere — confirm against the parser.
    Str(String),
    /// Integer literal.
    Int(i64),
    /// Regex literal delimited by `/.../`, stored without the delimiters.
    Regex(String),
    /// `=`
    Eq,
    /// `!=`
    Ne,
    /// `<`
    Lt,
    /// `<=`
    Le,
    /// `>`
    Gt,
    /// `>=`
    Ge,
    /// `~`
    Tilde,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// Keyword `and`.
    And,
    /// Keyword `or`.
    Or,
    /// Keyword `not`.
    Not,
}

/// A token with its source column, used for error messages.
#[derive(Debug, Clone)]
pub struct Spanned {
    /// The token itself.
    pub token: Token,
    /// Position of the token's first byte. As filled in by `tokenize`, this is
    /// a 1-based byte offset into the whole input string.
    pub column: usize,
}

pub fn tokenize(input: &str) -> DslResult<Vec<Spanned>> {
    let mut out = Vec::new();
    let bytes = input.as_bytes();
    let mut i = 0;
    while i < bytes.len() {
        let start = i + 1;
        let c = bytes[i];
        match c {
            b' ' | b'\t' | b'\n' => {
                i += 1;
            }
            b'(' => {
                out.push(Spanned { token: Token::LParen, column: start });
                i += 1;
            }
            b')' => {
                out.push(Spanned { token: Token::RParen, column: start });
                i += 1;
            }
            b'~' => {
                out.push(Spanned { token: Token::Tilde, column: start });
                i += 1;
            }
            b'=' => {
                out.push(Spanned { token: Token::Eq, column: start });
                i += 1;
            }
            b'!' if peek(bytes, i + 1) == Some(b'=') => {
                out.push(Spanned { token: Token::Ne, column: start });
                i += 2;
            }
            b'<' if peek(bytes, i + 1) == Some(b'=') => {
                out.push(Spanned { token: Token::Le, column: start });
                i += 2;
            }
            b'<' => {
                out.push(Spanned { token: Token::Lt, column: start });
                i += 1;
            }
            b'>' if peek(bytes, i + 1) == Some(b'=') => {
                out.push(Spanned { token: Token::Ge, column: start });
                i += 2;
            }
            b'>' => {
                out.push(Spanned { token: Token::Gt, column: start });
                i += 1;
            }
            b'"' => {
                let (literal, consumed) = read_string(&bytes[i..], start)?;
                out.push(Spanned { token: Token::Str(literal), column: start });
                i += consumed;
            }
            b'/' => {
                let (literal, consumed) = read_regex(&bytes[i..], start)?;
                out.push(Spanned { token: Token::Regex(literal), column: start });
                i += consumed;
            }
            b'0'..=b'9' | b'-' => {
                let (value, consumed) = read_int(&bytes[i..], start)?;
                out.push(Spanned { token: Token::Int(value), column: start });
                i += consumed;
            }
            c if is_ident_start(c) => {
                let (word, consumed) = read_ident(&bytes[i..]);
                let token = match word.as_str() {
                    "and" => Token::And,
                    "or" => Token::Or,
                    "not" => Token::Not,
                    _ => Token::Ident(word),
                };
                out.push(Spanned { token, column: start });
                i += consumed;
            }
            other => {
                return Err(DslError::Syntax {
                    column: start,
                    message: format!("unexpected byte {:?}", other as char),
                });
            }
        }
    }
    Ok(out)
}

/// Returns the byte at index `i`, or `None` when `i` is past the end of `bytes`.
fn peek(bytes: &[u8], i: usize) -> Option<u8> {
    let byte = *bytes.get(i)?;
    Some(byte)
}

/// Reports whether `c` may begin an identifier: an ASCII letter or `_`.
fn is_ident_start(c: u8) -> bool {
    matches!(c, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}

/// Reports whether `c` may appear after the first byte of an identifier:
/// anything that can start one, plus ASCII digits, `.` and `-`.
fn is_ident_cont(c: u8) -> bool {
    match c {
        b'.' | b'-' => true,
        _ => c.is_ascii_digit() || is_ident_start(c),
    }
}

/// Reads the longest prefix of `bytes` made up of identifier-continuation
/// bytes.
///
/// Returns the prefix as a `String` together with its length in bytes. The
/// first byte is not treated specially here, and the result is empty when
/// `bytes` does not begin with an identifier byte.
fn read_ident(bytes: &[u8]) -> (String, usize) {
    let len = bytes.iter().take_while(|&&b| is_ident_cont(b)).count();
    let word = String::from_utf8_lossy(&bytes[..len]).into_owned();
    (word, len)
}

/// Reads a double-quoted string literal from the start of `bytes`.
///
/// `bytes[0]` is taken to be the opening quote and is skipped without being
/// inspected. A backslash makes the following byte literal (so `\"` and `\\`
/// yield `"` and `\`); no other escape translation is performed, e.g. `\n`
/// yields the letter `n`.
///
/// Returns the decoded contents and the number of bytes consumed, including
/// both quotes.
///
/// # Errors
///
/// Returns `DslError::Syntax` at `column` when no closing quote is found.
fn read_string(bytes: &[u8], column: usize) -> DslResult<(String, usize)> {
    // Collect raw bytes and decode once at the end. Converting byte-by-byte
    // with `as char` would split multi-byte UTF-8 sequences into mojibake.
    let mut raw: Vec<u8> = Vec::new();
    let mut i = 1;
    while i < bytes.len() {
        match bytes[i] {
            b'"' => {
                // Lossy decoding keeps this infallible even if a caller ever
                // passes bytes that are not valid UTF-8.
                let literal = String::from_utf8_lossy(&raw).into_owned();
                return Ok((literal, i + 1));
            }
            b'\\' if i + 1 < bytes.len() => {
                raw.push(bytes[i + 1]);
                i += 2;
            }
            byte => {
                raw.push(byte);
                i += 1;
            }
        }
    }
    Err(DslError::Syntax {
        column,
        message: "unterminated string literal".into(),
    })
}

/// Reads a `/.../` regex literal from the start of `bytes`.
///
/// `bytes[0]` is taken to be the opening `/` and is skipped without being
/// inspected. A backslash and the byte after it are copied through unchanged,
/// so `\/` does not end the literal and the returned pattern still contains
/// the backslash.
///
/// Returns the pattern text (without the delimiters) and the number of bytes
/// consumed, including both slashes.
///
/// # Errors
///
/// Returns `DslError::Syntax` at `column` when no closing `/` is found.
fn read_regex(bytes: &[u8], column: usize) -> DslResult<(String, usize)> {
    // Collect raw bytes and decode once at the end. Converting byte-by-byte
    // with `as char` would split multi-byte UTF-8 sequences into mojibake.
    let mut raw: Vec<u8> = Vec::new();
    let mut i = 1;
    while i < bytes.len() {
        match bytes[i] {
            b'/' => {
                // Lossy decoding keeps this infallible even if a caller ever
                // passes bytes that are not valid UTF-8.
                let pattern = String::from_utf8_lossy(&raw).into_owned();
                return Ok((pattern, i + 1));
            }
            b'\\' if i + 1 < bytes.len() => {
                // Keep the escape intact for the regex engine.
                raw.extend_from_slice(&bytes[i..i + 2]);
                i += 2;
            }
            byte => {
                raw.push(byte);
                i += 1;
            }
        }
    }
    Err(DslError::Syntax {
        column,
        message: "unterminated regex literal".into(),
    })
}

/// Reads an optionally negative decimal integer from the start of `bytes`.
///
/// Accepts a single leading `-` followed by one or more ASCII digits and
/// stops at the first byte that is not a digit.
///
/// Returns the parsed value and the number of bytes consumed.
///
/// # Errors
///
/// Returns `DslError::Syntax` at `column` when no digit is present (this
/// includes empty input and a lone `-`) or when the value does not fit in an
/// `i64`.
fn read_int(bytes: &[u8], column: usize) -> DslResult<(i64, usize)> {
    // `first()` rather than `bytes[0]` so that an empty slice yields an error
    // instead of a panic.
    let sign_len = usize::from(bytes.first() == Some(&b'-'));
    let digit_count = bytes[sign_len..]
        .iter()
        .take_while(|b| b.is_ascii_digit())
        .count();
    if digit_count == 0 {
        return Err(DslError::Syntax {
            column,
            message: "expected digit".into(),
        });
    }

    let end = sign_len + digit_count;
    let raw = std::str::from_utf8(&bytes[..end])
        .expect("an optional '-' followed by ASCII digits is valid UTF-8");
    let value = raw.parse::<i64>().map_err(|_| DslError::Syntax {
        column,
        message: "integer literal out of range".into(),
    })?;
    Ok((value, end))
}