use crate::{Error, Span};
use std::borrow::Cow;
use std::char;
use std::fmt;
use std::iter;
use std::str;
/// A lexer that walks a borrowed source string and yields [`Token`]s.
///
/// Tokens borrow from the input, so they carry the same `'a` lifetime.
#[derive(Clone)]
pub struct Lexer<'a> {
// Peekable stream of `(byte offset, char)` pairs over `input`; the peeked
// offset doubles as the lexer's current position (see `cur`).
it: iter::Peekable<str::CharIndices<'a>>,
// The full source text; token slices and error positions index into it.
input: &'a str,
}
/// A single fragment of source text recognized by the [`Lexer`].
///
/// Every variant retains (directly or through its payload) the slice of the
/// input it was lexed from; see [`Token::src`].
#[derive(Debug, PartialEq)]
pub enum Token<'a> {
/// A line comment starting with `;;`, up to but excluding the newline.
LineComment(&'a str),
/// A block comment delimited by `(;` and `;)`, which may be nested.
BlockComment(&'a str),
/// A run of spaces, newlines, carriage returns and tabs.
Whitespace(&'a str),
/// A left parenthesis `(`.
LParen(&'a str),
/// A right parenthesis `)`.
RParen(&'a str),
/// A double-quoted string literal with escapes already decoded.
String(WasmString<'a>),
/// An identifier: `$` followed by at least one more id character.
Id(&'a str),
/// A run of id characters starting with a lowercase ASCII letter.
Keyword(&'a str),
/// Any other run of id characters, or a single one of `,;[]{}`.
Reserved(&'a str),
/// An integer literal.
Integer(Integer<'a>),
/// A floating-point literal (including `inf` and `nan` forms).
Float(Float<'a>),
}
/// The kinds of failure the lexer can report.
///
/// The `Display` implementation renders each variant as a short message.
#[derive(Debug, Clone, PartialEq)]
pub enum LexError {
/// A `(;` block comment was never closed by a matching `;)`.
DanglingBlockComment,
/// A character that cannot start any token was found.
Unexpected(char),
/// A character not allowed inside a string literal (a control character
/// below `0x20`, or `0x7f`) was found.
InvalidStringElement(char),
/// A backslash in a string literal was followed by an unknown escape.
InvalidStringEscape(char),
/// A hexadecimal digit was required but this character was found.
InvalidHexDigit(char),
/// A decimal digit was required but this character was found.
InvalidDigit(char),
/// A specific character was required but a different one was found.
Expected {
/// The character that was required.
wanted: char,
/// The character actually present.
found: char,
},
/// The input ended in the middle of a token.
UnexpectedEof,
/// A numeric value overflowed while being parsed.
NumberTooBig,
/// A `\u{..}` escape named a value that is not a Unicode scalar value.
InvalidUnicodeValue(u32),
/// An underscore in a numeric literal was not followed by a digit.
LoneUnderscore,
// Placeholder variant that discourages exhaustive matching by callers;
// never constructed in this file.
#[doc(hidden)]
__Nonexhaustive,
}
/// The explicit sign written in front of a numeric literal.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum SignToken {
/// A leading `+`.
Plus,
/// A leading `-`.
Minus,
}
/// An integer literal token.
///
/// The payload is boxed, which keeps the [`Token`] enum itself small.
#[derive(Debug, PartialEq)]
pub struct Integer<'a>(Box<IntegerInner<'a>>);
/// Heap-allocated payload of an [`Integer`] token.
#[derive(Debug, PartialEq)]
struct IntegerInner<'a> {
// Explicit sign written in the source, if any.
sign: Option<SignToken>,
// The exact source slice of the literal, including sign and `0x` prefix.
src: &'a str,
// The digits with underscores and any `0x` prefix removed; a leading `-`
// is prepended when the literal is negative (a `+` is dropped).
val: Cow<'a, str>,
// Whether the literal was written in hexadecimal (`0x...`).
hex: bool,
}
/// A floating-point literal token.
///
/// The payload is boxed, which keeps the [`Token`] enum itself small.
#[derive(Debug, PartialEq)]
pub struct Float<'a>(Box<FloatInner<'a>>);
/// Heap-allocated payload of a [`Float`] token.
#[derive(Debug, PartialEq)]
struct FloatInner<'a> {
// The exact source slice of the literal.
src: &'a str,
// The lexed components of the literal (not yet converted to a number).
val: FloatVal<'a>,
}
/// A string literal token.
///
/// The payload is boxed, which keeps the [`Token`] enum itself small.
#[derive(Debug, PartialEq)]
pub struct WasmString<'a>(Box<WasmStringInner<'a>>);
/// Heap-allocated payload of a [`WasmString`] token.
#[derive(Debug, PartialEq)]
struct WasmStringInner<'a> {
// The exact source slice of the literal, including both quotes.
src: &'a str,
// The decoded bytes of the string; borrowed from the input when the
// literal contains no escapes, owned otherwise.
val: Cow<'a, [u8]>,
}
/// The lexed pieces of a floating-point literal.
///
/// The lexer only splits the literal into parts; it does not convert them
/// into a numeric value.
#[derive(Debug, PartialEq)]
pub enum FloatVal<'a> {
/// A `nan` literal, optionally with a payload (`nan:0x...`).
Nan {
/// The payload parsed from the hexadecimal digits after `nan:0x`, if
/// one was written.
val: Option<u64>,
/// Whether the literal was preceded by `-`.
negative: bool,
},
/// An `inf` literal.
Inf {
/// Whether the literal was preceded by `-`.
#[allow(missing_docs)]
negative: bool,
},
/// An ordinary literal made of digits.
Val {
/// Whether the literal was written in hexadecimal (`0x...`).
hex: bool,
/// Digits before the `.`, underscores removed, with a leading `-`
/// when the literal is negative.
integral: Cow<'a, str>,
/// Digits after the `.`, underscores removed, if any were written.
decimal: Option<Cow<'a, str>>,
/// Decimal digits of the exponent, underscores removed, with a
/// leading `-` when the exponent is negative.
exponent: Option<Cow<'a, str>>,
},
}
impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `input`.
    pub fn new(input: &str) -> Lexer<'_> {
        Lexer {
            it: input.char_indices().peekable(),
            input,
        }
    }

    /// Returns the complete source string this lexer was created with.
    pub fn input(&self) -> &'a str {
        self.input
    }

    /// Lexes the next token from the input.
    ///
    /// Whitespace and comments are returned as tokens rather than skipped.
    ///
    /// # Errors
    ///
    /// Returns an error when the input at the current position does not form
    /// a valid token. Returns `Ok(None)` once the input is exhausted.
    pub fn parse(&mut self) -> Result<Option<Token<'a>>, Error> {
        if let Some(ws) = self.ws() {
            return Ok(Some(Token::Whitespace(ws)));
        }
        if let Some(comment) = self.comment()? {
            return Ok(Some(comment));
        }
        if let Some(token) = self.token()? {
            return Ok(Some(token));
        }
        match self.it.next() {
            Some((i, ch)) => Err(self.error(i, LexError::Unexpected(ch))),
            None => Ok(None),
        }
    }

    /// Lexes one token that is neither whitespace nor a comment.
    ///
    /// Returns `Ok(None)` only at end of input.
    fn token(&mut self) -> Result<Option<Token<'a>>, Error> {
        // Parentheses are single ASCII bytes, hence the `pos + 1` slices.
        if let Some(pos) = self.eat_char('(') {
            return Ok(Some(Token::LParen(&self.input[pos..pos + 1])));
        }
        if let Some(pos) = self.eat_char(')') {
            return Ok(Some(Token::RParen(&self.input[pos..pos + 1])));
        }

        if let Some(pos) = self.eat_char('"') {
            let val = self.string()?;
            let src = &self.input[pos..self.cur()];
            return Ok(Some(Token::String(WasmString(Box::new(WasmStringInner {
                val,
                src,
            })))));
        }

        let (start, prefix) = match self.it.peek().cloned() {
            Some((i, ch)) if is_idchar(ch) => (i, ch),
            // These characters form a reserved token on their own.
            Some((i, ch)) if is_reserved_extra(ch) => {
                self.it.next();
                return Ok(Some(Token::Reserved(&self.input[i..self.cur()])));
            }
            Some((i, ch)) => return Err(self.error(i, LexError::Unexpected(ch))),
            None => return Ok(None),
        };

        // Consume the maximal run of id characters, then classify it.
        while let Some((_, ch)) = self.it.peek().cloned() {
            if !is_idchar(ch) {
                break;
            }
            self.it.next();
        }

        let reserved = &self.input[start..self.cur()];
        let token = if let Some(number) = self.number(reserved) {
            number
        } else if prefix == '$' && reserved.len() > 1 {
            Token::Id(reserved)
        } else if prefix.is_ascii_lowercase() {
            Token::Keyword(reserved)
        } else {
            Token::Reserved(reserved)
        };
        Ok(Some(token))
    }

    /// Attempts to interpret `src` (a complete run of id characters) as an
    /// integer or float literal.
    ///
    /// Returns `None` when `src` is not a well-formed number, in which case
    /// the caller classifies it as an id, keyword or reserved token.
    fn number(&self, src: &'a str) -> Option<Token<'a>> {
        /// Consumes a run of digits accepted by `good` from `it`, allowing a
        /// single `_` between digits, and returns the digits without the
        /// underscores.
        ///
        /// When `negative` is set the result is prefixed with `-`. Returns
        /// `None` if the first character is not a digit or the run ends in
        /// an underscore. Stops (without consuming) at the first character
        /// that is neither a digit nor a permitted underscore.
        fn skip_underscores<'a>(
            it: &mut str::Chars<'a>,
            negative: bool,
            good: fn(&char) -> bool,
        ) -> Option<Cow<'a, str>> {
            enum State {
                // No underscore or sign seen yet: the result is a prefix of
                // the input and can be borrowed.
                Raw,
                // The result differs from the input and must be built up.
                Collecting(String),
            }
            let mut last_underscore = false;
            let mut state = if negative {
                State::Collecting("-".to_string())
            } else {
                State::Raw
            };
            let input = it.as_str();
            let first = it.next()?;
            if !good(&first) {
                return None;
            }
            if let State::Collecting(s) = &mut state {
                s.push(first);
            }
            // Number of digits consumed so far. The callers only pass ASCII
            // digit predicates, so this is also a byte offset into `input`
            // for as long as we are in the `Raw` state.
            let mut last = 1;
            while let Some(c) = it.clone().next() {
                if c == '_' && !last_underscore {
                    if let State::Raw = state {
                        state = State::Collecting(input[..last].to_string());
                    }
                    it.next();
                    last_underscore = true;
                    continue;
                }
                // A second consecutive `_` lands here and ends the run with
                // `last_underscore` still set, which is rejected below.
                if !good(&c) {
                    break;
                }
                if let State::Collecting(s) = &mut state {
                    s.push(c);
                }
                last_underscore = false;
                it.next();
                last += 1;
            }
            if last_underscore {
                return None;
            }
            Some(match state {
                State::Raw => input[..last].into(),
                State::Collecting(s) => s.into(),
            })
        }

        let (sign, num) = if src.starts_with('+') {
            (Some(SignToken::Plus), &src[1..])
        } else if src.starts_with('-') {
            (Some(SignToken::Minus), &src[1..])
        } else {
            (None, src)
        };
        let negative = sign == Some(SignToken::Minus);

        // Special float spellings: `inf`, `nan` and `nan:0x<payload>`.
        if num == "inf" {
            return Some(Token::Float(Float(Box::new(FloatInner {
                src,
                val: FloatVal::Inf { negative },
            }))));
        } else if num == "nan" {
            return Some(Token::Float(Float(Box::new(FloatInner {
                src,
                val: FloatVal::Nan {
                    val: None,
                    negative,
                },
            }))));
        } else if num.starts_with("nan:0x") {
            let mut it = num[6..].chars();
            let to_parse = skip_underscores(&mut it, false, char::is_ascii_hexdigit)?;
            if it.next().is_some() {
                return None;
            }
            let n = u64::from_str_radix(&to_parse, 16).ok()?;
            return Some(Token::Float(Float(Box::new(FloatInner {
                src,
                val: FloatVal::Nan {
                    val: Some(n),
                    negative,
                },
            }))));
        }

        // Pick the digit predicate based on an optional `0x` prefix.
        let (mut it, hex, test_valid) = if num.starts_with("0x") {
            (
                num[2..].chars(),
                true,
                char::is_ascii_hexdigit as fn(&char) -> bool,
            )
        } else {
            (
                num.chars(),
                false,
                char::is_ascii_digit as fn(&char) -> bool,
            )
        };

        // Integral part; if nothing follows it, this is an integer.
        let val = skip_underscores(&mut it, negative, test_valid)?;
        if it.as_str().is_empty() {
            return Some(Token::Integer(Integer(Box::new(IntegerInner {
                sign,
                src,
                val,
                hex,
            }))));
        }

        // Optional fractional part; a bare trailing `.` is allowed.
        let decimal = if it.clone().next() == Some('.') {
            it.next();
            match it.clone().next() {
                Some(c) if test_valid(&c) => Some(skip_underscores(&mut it, false, test_valid)?),
                Some(_) | None => None,
            }
        } else {
            None
        };

        // Optional exponent: `p`/`P` for hex floats, `e`/`E` otherwise. The
        // exponent digits are always decimal.
        let exponent = match (hex, it.next()) {
            (true, Some('p')) | (true, Some('P')) | (false, Some('e')) | (false, Some('E')) => {
                let negative = match it.clone().next() {
                    Some('-') => {
                        it.next();
                        true
                    }
                    Some('+') => {
                        it.next();
                        false
                    }
                    _ => false,
                };
                Some(skip_underscores(&mut it, negative, char::is_ascii_digit)?)
            }
            (_, None) => None,
            _ => return None,
        };

        // Anything left over means this was not a number after all.
        if it.next().is_some() {
            return None;
        }

        Some(Token::Float(Float(Box::new(FloatInner {
            src,
            val: FloatVal::Val {
                hex,
                integral: val,
                exponent,
                decimal,
            },
        }))))
    }

    /// Consumes a run of whitespace (space, `\n`, `\r`, `\t`) and returns it,
    /// or `None` if the input does not start with whitespace.
    fn ws(&mut self) -> Option<&'a str> {
        let start = self.cur();
        while let Some(&(_, ch)) = self.it.peek() {
            match ch {
                ' ' | '\n' | '\r' | '\t' => {
                    self.it.next();
                }
                _ => break,
            }
        }
        let end = self.cur();
        if start != end {
            Some(&self.input[start..end])
        } else {
            None
        }
    }

    /// Consumes a line comment (`;; ...`) or a possibly nested block comment
    /// (`(; ... ;)`) if one starts at the current position.
    ///
    /// # Errors
    ///
    /// Returns `DanglingBlockComment` when a block comment is not closed
    /// before the end of input.
    fn comment(&mut self) -> Result<Option<Token<'a>>, Error> {
        if let Some(start) = self.eat_str(";;") {
            // Run to the end of the line; the newline itself is not part of
            // the comment token.
            while let Some(&(_, ch)) = self.it.peek() {
                if ch == '\n' {
                    break;
                }
                self.it.next();
            }
            let end = self.cur();
            return Ok(Some(Token::LineComment(&self.input[start..end])));
        }

        if let Some(start) = self.eat_str("(;") {
            // Nesting depth of currently open block comments.
            let mut level = 1;
            while let Some((_, ch)) = self.it.next() {
                if ch == '(' && self.eat_char(';').is_some() {
                    level += 1;
                }
                if ch == ';' && self.eat_char(')').is_some() {
                    level -= 1;
                    if level == 0 {
                        let end = self.cur();
                        return Ok(Some(Token::BlockComment(&self.input[start..end])));
                    }
                }
            }
            return Err(self.error(start, LexError::DanglingBlockComment));
        }
        Ok(None)
    }

    /// Lexes the remainder of a string literal whose opening `"` has already
    /// been consumed, returning the decoded bytes.
    ///
    /// The result borrows from the input when the literal has no escapes and
    /// is an owned buffer otherwise.
    ///
    /// # Errors
    ///
    /// Returns an error for control characters, unknown or malformed
    /// escapes, and for input that ends before the closing `"`.
    fn string(&mut self) -> Result<Cow<'a, [u8]>, Error> {
        enum State {
            // No escape seen yet; holds the byte offset where the contents
            // begin so the result can be borrowed.
            Start(usize),
            // At least one escape seen; holds the bytes decoded so far.
            String(Vec<u8>),
        }
        let mut state = State::Start(self.cur());
        loop {
            match self.it.next() {
                Some((i, '\\')) => {
                    // First escape: switch to an owned buffer seeded with
                    // everything lexed so far.
                    if let State::Start(start) = state {
                        state = State::String(self.input[start..i].as_bytes().to_vec());
                    }
                    let buf = match &mut state {
                        State::String(b) => b,
                        State::Start(_) => unreachable!(),
                    };
                    match self.it.next() {
                        Some((_, '"')) => buf.push(b'"'),
                        Some((_, '\'')) => buf.push(b'\''),
                        Some((_, 't')) => buf.push(b'\t'),
                        Some((_, 'n')) => buf.push(b'\n'),
                        Some((_, 'r')) => buf.push(b'\r'),
                        Some((_, '\\')) => buf.push(b'\\'),
                        // `\u{hexnum}`: a Unicode scalar value, UTF-8 encoded.
                        Some((i, 'u')) => {
                            self.must_eat_char('{')?;
                            let n = self.hexnum()?;
                            let c = char::from_u32(n)
                                .ok_or_else(|| self.error(i, LexError::InvalidUnicodeValue(n)))?;
                            buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                            self.must_eat_char('}')?;
                        }
                        // `\hh`: a raw byte given as two hex digits.
                        Some((_, c1)) if c1.is_ascii_hexdigit() => {
                            let (_, c2) = self.hexdigit()?;
                            buf.push(to_hex(c1) * 16 + c2);
                        }
                        Some((i, c)) => return Err(self.error(i, LexError::InvalidStringEscape(c))),
                        None => return Err(self.error(self.input.len(), LexError::UnexpectedEof)),
                    }
                }
                Some((_, '"')) => break,
                Some((i, c)) => {
                    if (c as u32) < 0x20 || c as u32 == 0x7f {
                        return Err(self.error(i, LexError::InvalidStringElement(c)));
                    }
                    match &mut state {
                        State::Start(_) => {}
                        State::String(v) => {
                            v.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                        }
                    }
                }
                None => return Err(self.error(self.input.len(), LexError::UnexpectedEof)),
            }
        }
        match state {
            // The closing quote is one byte and was just consumed, so the
            // contents end one byte before the current position.
            State::Start(pos) => Ok(self.input[pos..self.cur() - 1].as_bytes().into()),
            State::String(s) => Ok(s.into()),
        }
    }

    /// Lexes a hexadecimal number: one or more hex digits, optionally
    /// separated by single underscores.
    ///
    /// # Errors
    ///
    /// Returns `InvalidHexDigit` if the first character is not a hex digit,
    /// `NumberTooBig` if the value overflows `u32`, and `LoneUnderscore` if
    /// an underscore is followed by another underscore or ends the number.
    fn hexnum(&mut self) -> Result<u32, Error> {
        let (_, n) = self.hexdigit()?;
        let mut last_underscore = false;
        let mut n = u32::from(n);
        while let Some((i, c)) = self.it.peek().cloned() {
            if c == '_' {
                // Only a single underscore may separate two digits, matching
                // the rule applied to numeric literals in `number`.
                if last_underscore {
                    return Err(self.error(i, LexError::LoneUnderscore));
                }
                self.it.next();
                last_underscore = true;
                continue;
            }
            if !c.is_ascii_hexdigit() {
                break;
            }
            last_underscore = false;
            self.it.next();
            n = n
                .checked_mul(16)
                .and_then(|n| n.checked_add(u32::from(to_hex(c))))
                .ok_or_else(|| self.error(i, LexError::NumberTooBig))?;
        }
        if last_underscore {
            // The underscore is one byte and was the last thing consumed.
            let cur = self.cur();
            return Err(self.error(cur - 1, LexError::LoneUnderscore));
        }
        Ok(n)
    }

    /// Consumes one character and returns its offset and value as a hex
    /// digit, or an `InvalidHexDigit` / `UnexpectedEof` error.
    fn hexdigit(&mut self) -> Result<(usize, u8), Error> {
        let (i, ch) = self.must_char()?;
        if ch.is_ascii_hexdigit() {
            Ok((i, to_hex(ch)))
        } else {
            Err(self.error(i, LexError::InvalidHexDigit(ch)))
        }
    }

    /// Consumes `s` if the remaining input starts with it, returning the
    /// byte offset where it began.
    fn eat_str(&mut self, s: &str) -> Option<usize> {
        if !self.cur_str().starts_with(s) {
            return None;
        }
        let ret = self.cur();
        for _ in s.chars() {
            self.it.next();
        }
        Some(ret)
    }

    /// Consumes the next character if it equals `needle`, returning its
    /// byte offset.
    fn eat_char(&mut self, needle: char) -> Option<usize> {
        match self.it.peek() {
            Some((i, c)) if *c == needle => {
                let ret = *i;
                self.it.next();
                Some(ret)
            }
            _ => None,
        }
    }

    /// Consumes and returns the next character, or an `UnexpectedEof` error
    /// at end of input.
    fn must_char(&mut self) -> Result<(usize, char), Error> {
        self.it
            .next()
            .ok_or_else(|| self.error(self.input.len(), LexError::UnexpectedEof))
    }

    /// Consumes the next character and checks that it is `wanted`,
    /// returning its byte offset or an `Expected` / `UnexpectedEof` error.
    fn must_eat_char(&mut self, wanted: char) -> Result<usize, Error> {
        let (pos, found) = self.must_char()?;
        if wanted == found {
            Ok(pos)
        } else {
            Err(self.error(pos, LexError::Expected { wanted, found }))
        }
    }

    /// Returns the byte offset of the next unconsumed character, or the
    /// input length at end of input.
    fn cur(&mut self) -> usize {
        self.it.peek().map(|p| p.0).unwrap_or(self.input.len())
    }

    /// Returns the not-yet-consumed tail of the input.
    fn cur_str(&mut self) -> &'a str {
        &self.input[self.cur()..]
    }

    /// Builds a lexing error of the given `kind` located at byte offset
    /// `pos` of the input.
    fn error(&self, pos: usize, kind: LexError) -> Error {
        Error::lex(Span { offset: pos }, self.input, kind)
    }
}
/// Iterating a lexer yields every token in order, ending at end of input.
/// An error is yielded as an `Err` item.
impl<'a> Iterator for Lexer<'a> {
    type Item = Result<Token<'a>, Error>;

    fn next(&mut self) -> Option<Self::Item> {
        // Turn `Result<Option<_>, _>` inside out: end of input ends the
        // iteration, everything else becomes an item.
        match self.parse() {
            Ok(Some(token)) => Some(Ok(token)),
            Ok(None) => None,
            Err(err) => Some(Err(err)),
        }
    }
}
impl<'a> Token<'a> {
    /// Returns the slice of the original input this token was lexed from.
    pub fn src(&self) -> &'a str {
        match self {
            // Variants that carry their source slice directly.
            Token::Whitespace(text)
            | Token::BlockComment(text)
            | Token::LineComment(text)
            | Token::LParen(text)
            | Token::RParen(text)
            | Token::Id(text)
            | Token::Keyword(text)
            | Token::Reserved(text) => text,
            // Variants whose payload remembers its own source slice.
            Token::String(string) => string.src(),
            Token::Integer(integer) => integer.src(),
            Token::Float(float) => float.src(),
        }
    }
}
impl<'a> Integer<'a> {
    /// Returns the explicit sign written on this literal, if any.
    pub fn sign(&self) -> Option<SignToken> {
        let inner = &*self.0;
        inner.sign
    }

    /// Returns the slice of the original input this literal was lexed from.
    pub fn src(&self) -> &'a str {
        let inner = &*self.0;
        inner.src
    }

    /// Returns the cleaned-up digit string of this literal together with the
    /// radix (16 for hexadecimal literals, 10 otherwise) to parse it in.
    pub fn val(&self) -> (&str, u32) {
        let inner = &*self.0;
        let digits: &str = &inner.val;
        let radix = if inner.hex { 16 } else { 10 };
        (digits, radix)
    }
}
impl<'a> Float<'a> {
    /// Returns the slice of the original input this literal was lexed from.
    pub fn src(&self) -> &'a str {
        let inner = &*self.0;
        inner.src
    }

    /// Returns the lexed components of this literal.
    pub fn val(&self) -> &FloatVal<'a> {
        let inner = &*self.0;
        &inner.val
    }
}
impl<'a> WasmString<'a> {
    /// Returns the slice of the original input this literal was lexed from,
    /// quotes included.
    pub fn src(&self) -> &'a str {
        let inner = &*self.0;
        inner.src
    }

    /// Returns the decoded bytes of this string literal.
    pub fn val(&self) -> &[u8] {
        let inner = &*self.0;
        &*inner.val
    }
}
/// Converts a single hexadecimal digit character to its numeric value.
///
/// Callers are expected to have checked that `c` is an ASCII hex digit.
fn to_hex(c: char) -> u8 {
    // Pick the character that maps to `bias` for the range `c` falls in.
    let (base, bias) = if 'a' <= c && c <= 'f' {
        (b'a', 10)
    } else if 'A' <= c && c <= 'F' {
        (b'A', 10)
    } else {
        (b'0', 0)
    };
    c as u8 - base + bias
}
/// Returns whether `c` may appear in an identifier, keyword, number or
/// reserved token: an ASCII letter or digit, or one of a fixed set of ASCII
/// punctuation characters.
fn is_idchar(c: char) -> bool {
    // The punctuation characters permitted in addition to alphanumerics.
    const SYMBOLS: &str = "!#$%&'*+-./:<=>?@\\^_`|~";
    c.is_ascii_alphanumeric() || SYMBOLS.contains(c)
}
/// Returns whether `c` is one of the characters (`,` `;` `[` `]` `{` `}`)
/// that are not id characters but are still lexed as a reserved token.
fn is_reserved_extra(c: char) -> bool {
    ",;[]{}".contains(c)
}
impl fmt::Display for LexError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use LexError::*;
match self {
DanglingBlockComment => f.write_str("unterminated block comment")?,
Unexpected(c) => write!(f, "unexpected character {:?}", c)?,
InvalidStringElement(c) => write!(f, "invalid character in string {:?}", c)?,
InvalidStringEscape(c) => write!(f, "invalid string escape {:?}", c)?,
InvalidHexDigit(c) => write!(f, "invalid hex digit {:?}", c)?,
InvalidDigit(c) => write!(f, "invalid decimal digit {:?}", c)?,
Expected { wanted, found } => write!(f, "expected {:?} but found {:?}", wanted, found)?,
UnexpectedEof => write!(f, "unexpected end-of-file")?,
NumberTooBig => f.write_str("number is too big to parse")?,
InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{:x}", c)?,
LoneUnderscore => write!(f, "bare underscore in numeric literal")?,
__Nonexhaustive => unreachable!(),
}
Ok(())
}
}
/// Smoke tests for the lexer: each test lexes the first token of a small
/// input and checks its kind, source slice and decoded value.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn ws_smoke() {
        fn get_whitespace(input: &str) -> &str {
            match Lexer::new(input).parse().expect("no first token") {
                Some(Token::Whitespace(s)) => s,
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_whitespace(" "), " ");
        assert_eq!(get_whitespace(" "), " ");
        assert_eq!(get_whitespace(" \n "), " \n ");
        assert_eq!(get_whitespace(" x"), " ");
        assert_eq!(get_whitespace(" ;"), " ");
    }

    #[test]
    fn line_comment_smoke() {
        fn get_line_comment(input: &str) -> &str {
            match Lexer::new(input).parse().expect("no first token") {
                Some(Token::LineComment(s)) => s,
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_line_comment(";;"), ";;");
        assert_eq!(get_line_comment(";; xyz"), ";; xyz");
        assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz");
        assert_eq!(get_line_comment(";;\nabc"), ";;");
        assert_eq!(get_line_comment(";; \nabc"), ";; ");
    }

    #[test]
    fn block_comment_smoke() {
        fn get_block_comment(input: &str) -> &str {
            match Lexer::new(input).parse().expect("no first token") {
                Some(Token::BlockComment(s)) => s,
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_block_comment("(;;)"), "(;;)");
        assert_eq!(get_block_comment("(; ;)"), "(; ;)");
        assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)");
    }

    /// Lexes and returns the first token of `input`, panicking if lexing
    /// fails or the input is empty.
    fn get_token(input: &str) -> Token<'_> {
        Lexer::new(input)
            .parse()
            .expect("no first token")
            .expect("no token")
    }

    #[test]
    fn lparen() {
        assert_eq!(get_token("(("), Token::LParen("("));
    }

    #[test]
    fn rparen() {
        assert_eq!(get_token(")("), Token::RParen(")"));
    }

    #[test]
    fn strings() {
        fn get_string(input: &str) -> Vec<u8> {
            match get_token(input) {
                Token::String(s) => {
                    assert_eq!(input, s.src());
                    s.val().to_vec()
                }
                other => panic!("not string {:?}", other),
            }
        }
        assert_eq!(&*get_string("\"\""), b"");
        assert_eq!(&*get_string("\"a\""), b"a");
        assert_eq!(&*get_string("\"a b c d\""), b"a b c d");
        assert_eq!(&*get_string("\"\\\"\""), b"\"");
        assert_eq!(&*get_string("\"\\'\""), b"'");
        assert_eq!(&*get_string("\"\\n\""), b"\n");
        assert_eq!(&*get_string("\"\\t\""), b"\t");
        assert_eq!(&*get_string("\"\\r\""), b"\r");
        assert_eq!(&*get_string("\"\\\\\""), b"\\");
        assert_eq!(&*get_string("\"\\01\""), &[1]);
        assert_eq!(&*get_string("\"\\u{1}\""), &[1]);
        assert_eq!(
            &*get_string("\"\\u{0f3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );
        assert_eq!(
            &*get_string("\"\\u{0_f_3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );

        // Every two-digit hex escape decodes to the corresponding raw byte.
        for i in 0..=255i32 {
            let s = format!("\"\\{:02x}\"", i);
            assert_eq!(&*get_string(&s), &[i as u8]);
        }
    }

    #[test]
    fn id() {
        fn get_id(input: &str) -> &str {
            match get_token(input) {
                Token::Id(s) => s,
                other => panic!("not id {:?}", other),
            }
        }
        assert_eq!(get_id("$x"), "$x");
        assert_eq!(get_id("$xyz"), "$xyz");
        assert_eq!(get_id("$x_z"), "$x_z");
        assert_eq!(get_id("$0^"), "$0^");
        assert_eq!(get_id("$0^;;"), "$0^");
        assert_eq!(get_id("$0^ ;;"), "$0^");
    }

    #[test]
    fn keyword() {
        fn get_keyword(input: &str) -> &str {
            match get_token(input) {
                Token::Keyword(s) => s,
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(get_keyword("x"), "x");
        assert_eq!(get_keyword("xyz"), "xyz");
        assert_eq!(get_keyword("x_z"), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
    }

    #[test]
    fn reserved() {
        fn get_reserved(input: &str) -> &str {
            match get_token(input) {
                Token::Reserved(s) => s,
                other => panic!("not reserved {:?}", other),
            }
        }
        assert_eq!(get_reserved("$ "), "$");
        assert_eq!(get_reserved("^_x "), "^_x");
    }

    #[test]
    fn integer() {
        fn get_integer(input: &str) -> String {
            match get_token(input) {
                Token::Integer(i) => {
                    assert_eq!(input, i.src());
                    i.val().0.to_string()
                }
                other => panic!("not integer {:?}", other),
            }
        }
        assert_eq!(get_integer("1"), "1");
        assert_eq!(get_integer("0"), "0");
        assert_eq!(get_integer("-1"), "-1");
        assert_eq!(get_integer("+1"), "1");
        assert_eq!(get_integer("+1_000"), "1000");
        assert_eq!(get_integer("+1_0_0_0"), "1000");
        assert_eq!(get_integer("+0x10"), "10");
        assert_eq!(get_integer("-0x10"), "-10");
        assert_eq!(get_integer("0x10"), "10");
    }

    #[test]
    fn float() {
        fn get_float(input: &str) -> FloatVal<'_> {
            match get_token(input) {
                Token::Float(i) => {
                    assert_eq!(input, i.src());
                    i.0.val
                }
                other => panic!("not float {:?}", other),
            }
        }
        assert_eq!(
            get_float("nan"),
            FloatVal::Nan {
                val: None,
                negative: false
            },
        );
        assert_eq!(
            get_float("-nan"),
            FloatVal::Nan {
                val: None,
                negative: true,
            },
        );
        assert_eq!(
            get_float("+nan"),
            FloatVal::Nan {
                val: None,
                negative: false,
            },
        );
        assert_eq!(
            get_float("+nan:0x1"),
            FloatVal::Nan {
                val: Some(1),
                negative: false,
            },
        );
        assert_eq!(
            get_float("nan:0x7f_ffff"),
            FloatVal::Nan {
                val: Some(0x7fffff),
                negative: false,
            },
        );
        assert_eq!(get_float("inf"), FloatVal::Inf { negative: false });
        assert_eq!(get_float("-inf"), FloatVal::Inf { negative: true });
        assert_eq!(get_float("+inf"), FloatVal::Inf { negative: false });

        assert_eq!(
            get_float("1.2"),
            FloatVal::Val {
                integral: "1".into(),
                decimal: Some("2".into()),
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("1.2e3"),
            FloatVal::Val {
                integral: "1".into(),
                decimal: Some("2".into()),
                exponent: Some("3".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("-1_2.1_1E+0_1"),
            FloatVal::Val {
                integral: "-12".into(),
                decimal: Some("11".into()),
                exponent: Some("01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("+1_2.1_1E-0_1"),
            FloatVal::Val {
                integral: "12".into(),
                decimal: Some("11".into()),
                exponent: Some("-01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1_2.3_4p5_6"),
            FloatVal::Val {
                integral: "12".into(),
                decimal: Some("34".into()),
                exponent: Some("56".into()),
                hex: true,
            },
        );
        assert_eq!(
            get_float("+0x1_2.3_4P-5_6"),
            FloatVal::Val {
                integral: "12".into(),
                decimal: Some("34".into()),
                exponent: Some("-56".into()),
                hex: true,
            },
        );
        assert_eq!(
            get_float("1."),
            FloatVal::Val {
                integral: "1".into(),
                decimal: None,
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1p-24"),
            FloatVal::Val {
                integral: "1".into(),
                decimal: None,
                exponent: Some("-24".into()),
                hex: true,
            },
        );
    }
}