//! Tokenizer for the filter DSL.
//!
//! Hand-written rather than generated because the grammar is tiny and a clean
//! manual implementation is easier to debug than a macro expansion.
use crate::filter::dsl::error::{DslError, DslResult};
/// A single lexical token of the filter DSL.
///
/// Payload-carrying variants hold the literal's content without its
/// delimiters. The type is `Eq` as well as `PartialEq` because every payload
/// (`String`, `i64`) has a total equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// Bare identifier, e.g. `status`, `body`, `host`.
    Ident(String),
    /// String literal. The tokenizer in this module produces it from
    /// double-quoted input; the surrounding quotes are not part of the payload.
    Str(String),
    /// Integer literal.
    Int(i64),
    /// Regex literal delimited by `/.../`; the payload excludes the slashes.
    Regex(String),
    /// The `=` operator.
    Eq,
    /// The `!=` operator.
    Ne,
    /// The `<` operator.
    Lt,
    /// The `<=` operator.
    Le,
    /// The `>` operator.
    Gt,
    /// The `>=` operator.
    Ge,
    /// The `~` operator.
    Tilde,
    /// Opening parenthesis `(`.
    LParen,
    /// Closing parenthesis `)`.
    RParen,
    /// The `and` keyword.
    And,
    /// The `or` keyword.
    Or,
    /// The `not` keyword.
    Not,
}
/// Pairs a [`Token`] with the column at which it starts in the input, so that
/// error messages can point at the offending position.
#[derive(Debug, Clone)]
pub struct Spanned {
    /// The token itself.
    pub token: Token,
    /// Column where the token begins.
    pub column: usize,
}
pub fn tokenize(input: &str) -> DslResult<Vec<Spanned>> {
let mut out = Vec::new();
let bytes = input.as_bytes();
let mut i = 0;
while i < bytes.len() {
let start = i + 1;
let c = bytes[i];
match c {
b' ' | b'\t' | b'\n' => {
i += 1;
}
b'(' => {
out.push(Spanned { token: Token::LParen, column: start });
i += 1;
}
b')' => {
out.push(Spanned { token: Token::RParen, column: start });
i += 1;
}
b'~' => {
out.push(Spanned { token: Token::Tilde, column: start });
i += 1;
}
b'=' => {
out.push(Spanned { token: Token::Eq, column: start });
i += 1;
}
b'!' if peek(bytes, i + 1) == Some(b'=') => {
out.push(Spanned { token: Token::Ne, column: start });
i += 2;
}
b'<' if peek(bytes, i + 1) == Some(b'=') => {
out.push(Spanned { token: Token::Le, column: start });
i += 2;
}
b'<' => {
out.push(Spanned { token: Token::Lt, column: start });
i += 1;
}
b'>' if peek(bytes, i + 1) == Some(b'=') => {
out.push(Spanned { token: Token::Ge, column: start });
i += 2;
}
b'>' => {
out.push(Spanned { token: Token::Gt, column: start });
i += 1;
}
b'"' => {
let (literal, consumed) = read_string(&bytes[i..], start)?;
out.push(Spanned { token: Token::Str(literal), column: start });
i += consumed;
}
b'/' => {
let (literal, consumed) = read_regex(&bytes[i..], start)?;
out.push(Spanned { token: Token::Regex(literal), column: start });
i += consumed;
}
b'0'..=b'9' | b'-' => {
let (value, consumed) = read_int(&bytes[i..], start)?;
out.push(Spanned { token: Token::Int(value), column: start });
i += consumed;
}
c if is_ident_start(c) => {
let (word, consumed) = read_ident(&bytes[i..]);
let token = match word.as_str() {
"and" => Token::And,
"or" => Token::Or,
"not" => Token::Not,
_ => Token::Ident(word),
};
out.push(Spanned { token, column: start });
i += consumed;
}
other => {
return Err(DslError::Syntax {
column: start,
message: format!("unexpected byte {:?}", other as char),
});
}
}
}
Ok(out)
}
/// Returns the byte at index `i`, or `None` when `i` is at or past the end of
/// `bytes`.
fn peek(bytes: &[u8], i: usize) -> Option<u8> {
    (i < bytes.len()).then(|| bytes[i])
}
/// Reports whether `c` may begin an identifier: an ASCII letter or `_`.
fn is_ident_start(c: u8) -> bool {
    matches!(c, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
/// Reports whether `c` may appear after the first byte of an identifier:
/// anything that may start one, plus ASCII digits, `.` and `-`.
fn is_ident_cont(c: u8) -> bool {
    match c {
        b'.' | b'-' => true,
        b'0'..=b'9' => true,
        _ => is_ident_start(c),
    }
}
/// Reads the longest prefix of `bytes` made up of identifier-continuation
/// bytes.
///
/// Returns the prefix as a `String` together with its length in bytes; both
/// are empty/zero when the first byte is not an identifier byte.
fn read_ident(bytes: &[u8]) -> (String, usize) {
    let len = bytes.iter().take_while(|&&b| is_ident_cont(b)).count();
    let word = String::from_utf8_lossy(&bytes[..len]).into_owned();
    (word, len)
}
/// Reads a double-quoted string literal from the start of `bytes`.
///
/// `bytes[0]` is taken to be the opening `"` and is skipped without being
/// inspected. A backslash makes the byte after it literal (so `\"` yields `"`
/// and `\\` yields `\`); no other escape translation is performed.
///
/// Returns the unescaped text and the number of bytes consumed, including
/// both quotes.
///
/// # Errors
///
/// Returns [`DslError::Syntax`] at `column` when no closing quote is found.
fn read_string(bytes: &[u8], column: usize) -> DslResult<(String, usize)> {
    // Collect raw bytes and decode once at the end: pushing each byte as a
    // `char` would turn every multi-byte UTF-8 sequence into Latin-1 mojibake.
    let mut raw: Vec<u8> = Vec::new();
    let mut i = 1;
    while let Some(&byte) = bytes.get(i) {
        match byte {
            b'"' => {
                // Only ASCII bytes are dropped, so text that came from a
                // `&str` stays valid UTF-8; the lossy decode is a safeguard
                // for arbitrary byte input.
                let text = String::from_utf8_lossy(&raw).into_owned();
                return Ok((text, i + 1));
            }
            b'\\' if i + 1 < bytes.len() => {
                raw.push(bytes[i + 1]);
                i += 2;
            }
            other => {
                raw.push(other);
                i += 1;
            }
        }
    }
    Err(DslError::Syntax {
        column,
        message: "unterminated string literal".into(),
    })
}
/// Reads a `/.../`-delimited regex literal from the start of `bytes`.
///
/// `bytes[0]` is taken to be the opening `/` and is skipped without being
/// inspected. Unlike string literals, escapes are preserved verbatim: a
/// backslash and the byte after it are both kept, so `\/` does not end the
/// literal and the pattern still contains `\/`.
///
/// Returns the pattern text and the number of bytes consumed, including both
/// slashes.
///
/// # Errors
///
/// Returns [`DslError::Syntax`] at `column` when no closing slash is found.
fn read_regex(bytes: &[u8], column: usize) -> DslResult<(String, usize)> {
    // Collect raw bytes and decode once at the end: pushing each byte as a
    // `char` would turn every multi-byte UTF-8 sequence into Latin-1 mojibake.
    let mut raw: Vec<u8> = Vec::new();
    let mut i = 1;
    while let Some(&byte) = bytes.get(i) {
        match byte {
            b'/' => {
                // Nothing is dropped from the input here, so text that came
                // from a `&str` stays valid UTF-8; the lossy decode is a
                // safeguard for arbitrary byte input.
                let pattern = String::from_utf8_lossy(&raw).into_owned();
                return Ok((pattern, i + 1));
            }
            b'\\' if i + 1 < bytes.len() => {
                raw.push(b'\\');
                raw.push(bytes[i + 1]);
                i += 2;
            }
            other => {
                raw.push(other);
                i += 1;
            }
        }
    }
    Err(DslError::Syntax {
        column,
        message: "unterminated regex literal".into(),
    })
}
/// Reads a decimal integer literal, with an optional leading `-`, from the
/// start of `bytes`.
///
/// Returns the parsed value and the number of bytes consumed (sign included).
/// Reading stops at the first byte that is not an ASCII digit.
///
/// # Errors
///
/// Returns [`DslError::Syntax`] at `column` when no digit follows the optional
/// sign (this covers an empty slice and a lone `-`), or when the value does
/// not fit in an `i64`.
fn read_int(bytes: &[u8], column: usize) -> DslResult<(i64, usize)> {
    // `first()` instead of `bytes[0]` so an empty slice is an error, not a panic.
    let sign_len = usize::from(bytes.first() == Some(&b'-'));
    let digit_count = bytes[sign_len..]
        .iter()
        .take_while(|b| b.is_ascii_digit())
        .count();
    if digit_count == 0 {
        return Err(DslError::Syntax {
            column,
            message: "expected digit".into(),
        });
    }
    let end = sign_len + digit_count;
    let text = std::str::from_utf8(&bytes[..end])
        .expect("an optional '-' followed by ASCII digits is valid UTF-8");
    // Parse with the sign attached so that `i64::MIN` is representable.
    let value = text.parse::<i64>().map_err(|_| DslError::Syntax {
        column,
        message: "integer literal out of range".into(),
    })?;
    Ok((value, end))
}