//! Lexer: turns raw source text into a stream of `(Token, Span)` pairs,
//! collecting comments and blank-line positions as out-of-band metadata.
use super::{
|
|
error::ParseError,
|
|
extra::ModuleExtra,
|
|
token::{Base, Token},
|
|
};
|
|
use crate::ast::Span;
|
|
use chumsky::prelude::*;
|
|
use num_bigint::BigInt;
|
|
use ordinal::Ordinal;
|
|
|
|
/// The result of a successful lexing pass: the filtered token stream plus
/// out-of-band information that was stripped from it.
pub struct LexInfo {
    // Lexed tokens paired with their source spans. Comment tokens, newlines
    // and empty lines do not appear here; see `run`.
    pub tokens: Vec<(Token, Span)>,
    // Spans of module/doc/regular comments and byte offsets of empty lines,
    // collected while filtering the token stream.
    pub extra: ModuleExtra,
}
|
|
|
|
pub fn run(src: &str) -> Result<LexInfo, Vec<ParseError>> {
|
|
let len = src.as_bytes().len();
|
|
|
|
let tokens = lexer().parse(chumsky::Stream::from_iter(
|
|
Span::create(len, 1),
|
|
src.chars().scan(0, |i, c| {
|
|
let start = *i;
|
|
let offset = c.len_utf8();
|
|
*i = start + offset;
|
|
Some((c, Span::create(start, offset)))
|
|
}),
|
|
))?;
|
|
|
|
let mut extra = ModuleExtra::new();
|
|
|
|
let mut previous_is_newline = false;
|
|
|
|
let tokens = tokens
|
|
.into_iter()
|
|
.filter_map(|(token, ref span)| {
|
|
let current_is_newline = token == Token::NewLine || token == Token::EmptyLine;
|
|
let result = match token {
|
|
Token::ModuleComment => {
|
|
extra.module_comments.push(*span);
|
|
None
|
|
}
|
|
Token::DocComment => {
|
|
extra.doc_comments.push(*span);
|
|
None
|
|
}
|
|
Token::Comment => {
|
|
extra.comments.push(*span);
|
|
None
|
|
}
|
|
Token::EmptyLine => {
|
|
extra.empty_lines.push(span.start);
|
|
None
|
|
}
|
|
Token::LeftParen => {
|
|
if previous_is_newline {
|
|
Some((Token::NewLineLeftParen, *span))
|
|
} else {
|
|
Some((Token::LeftParen, *span))
|
|
}
|
|
}
|
|
Token::Minus => {
|
|
if previous_is_newline {
|
|
Some((Token::NewLineMinus, *span))
|
|
} else {
|
|
Some((Token::Minus, *span))
|
|
}
|
|
}
|
|
Token::Pipe => {
|
|
if previous_is_newline {
|
|
Some((Token::NewLinePipe, *span))
|
|
} else {
|
|
Some((Token::Pipe, *span))
|
|
}
|
|
}
|
|
Token::NewLine => None,
|
|
_ => Some((token, *span)),
|
|
};
|
|
|
|
previous_is_newline = current_is_newline;
|
|
|
|
result
|
|
})
|
|
.collect::<Vec<(Token, Span)>>();
|
|
|
|
Ok(LexInfo { tokens, extra })
|
|
}
|
|
|
|
/// Build the character-level lexer.
///
/// Produces a parser that turns a `char` stream into `(Token, Span)` pairs.
/// Note that `choice` alternatives are order-sensitive: more specific
/// parsers are listed before the prefixes they share with shorter ones
/// (e.g. `==` before `=`, `0x…` before plain decimal).
pub fn lexer() -> impl Parser<char, Vec<(Token, Span)>, Error = ParseError> {
    // Plain decimal integer literal, e.g. `1234`.
    let base10 = text::int(10).map(|value| Token::Int {
        value,
        base: Base::Decimal {
            numeric_underscore: false,
        },
    });

    // Decimal literal with `_` separators: groups of 1-3 digits, at least
    // two groups (e.g. `1_000_000`). Digits are re-collected without the
    // underscores; the base records that underscores were present.
    let base10_underscore = one_of("0123456789")
        .repeated()
        .at_least(1)
        .at_most(3)
        .separated_by(just("_"))
        .at_least(2)
        .flatten()
        .collect::<String>()
        .map(|value| Token::Int {
            value,
            base: Base::Decimal {
                numeric_underscore: true,
            },
        });

    // Hexadecimal literal `0x…`. The digits are normalized to a base-10
    // string via `BigInt`; on malformed digits an error is emitted but a
    // token (with an empty value) is still produced so lexing can continue.
    let base16 = just("0x")
        .ignore_then(
            one_of("0123456789abcdefABCDEF")
                .repeated()
                .at_least(1)
                .collect::<String>(),
        )
        .validate(|value: String, span, emit| {
            let value = match BigInt::parse_bytes(value.as_bytes(), 16) {
                None => {
                    emit(ParseError::malformed_base16_digits(span));
                    String::new()
                }
                Some(n) => n.to_str_radix(10),
            };

            Token::Int {
                value,
                base: Base::Hexadecimal,
            }
        });

    let int = choice((base16, base10_underscore, base10));

    // Tuple-index ordinals such as `1st` / `2nd`. The numeric part is
    // re-parsed; a wrong suffix (e.g. `2st`) emits an error carrying the
    // expected suffix but still yields an `Ordinal` token.
    let ordinal = text::int(10)
        .then_with(|index: String| {
            choice((just("st"), just("nd"), just("rd"), just("th")))
                .map(move |suffix| (index.to_string(), suffix))
        })
        .validate(|(index, suffix), span, emit| match index.parse() {
            // Overflow of the u32 index; recover with index 0.
            Err { .. } => {
                emit(ParseError::invalid_tuple_index(span, index, None));
                Token::Ordinal { index: 0 }
            }
            Ok(index) => {
                let expected_suffix = Ordinal::<u32>(index).suffix();
                if expected_suffix != suffix {
                    emit(ParseError::invalid_tuple_index(
                        span,
                        index.to_string(),
                        Some(expected_suffix.to_string()),
                    ))
                }
                Token::Ordinal { index }
            }
        });

    // Operators and punctuation. Two-character operators precede the
    // single-character operators they start with. The inner `choice` only
    // groups the comparison operators to stay within chumsky's tuple limit.
    let op = choice((
        just("==").to(Token::EqualEqual),
        just('=').to(Token::Equal),
        just("..").to(Token::DotDot),
        just('.').to(Token::Dot),
        just("!=").to(Token::NotEqual),
        just('!').to(Token::Bang),
        just('?').to(Token::Question),
        just("<-").to(Token::LArrow),
        just("->").to(Token::RArrow),
        choice((
            just("<=").to(Token::LessEqual),
            just('<').to(Token::Less),
            just(">=").to(Token::GreaterEqual),
            just('>').to(Token::Greater),
        )),
        just('+').to(Token::Plus),
        just('-').to(Token::Minus),
        just('*').to(Token::Star),
        just('/').to(Token::Slash),
        just('%').to(Token::Percent),
        just("|>").to(Token::Pipe),
        just(',').to(Token::Comma),
        just(':').to(Token::Colon),
        just("||").to(Token::VbarVbar),
        just('|').to(Token::Vbar),
        just("&&").to(Token::AmperAmper),
        just('#').to(Token::Hash),
    ));

    // Bracket pairs.
    let grouping = choice((
        just('(').to(Token::LeftParen),
        just(')').to(Token::RightParen),
        just('[').to(Token::LeftSquare),
        just(']').to(Token::RightSquare),
        just('{').to(Token::LeftBrace),
        just('}').to(Token::RightBrace),
    ));

    // Backslash escapes usable inside string and bytestring literals.
    let escape = just('\\').ignore_then(
        just('\\')
            .or(just('"'))
            .or(just('n').to('\n'))
            .or(just('r').to('\r'))
            .or(just('t').to('\t'))
            .or(just('0').to('\0')),
    );

    // Text string literal: `@"…"` (the `@` prefix distinguishes it from a
    // bytestring).
    let string = just('@')
        .ignore_then(just('"'))
        .ignore_then(filter(|c| *c != '\\' && *c != '"').or(escape).repeated())
        .then_ignore(just('"'))
        .collect::<String>()
        .map(|value| Token::String { value })
        .labelled("string");

    // Bytestring literal: a bare `"…"`.
    let bytestring = just('"')
        .ignore_then(filter(|c| *c != '\\' && *c != '"').or(escape).repeated())
        .then_ignore(just('"'))
        .collect::<String>()
        .map(|value| Token::ByteString { value })
        .labelled("bytestring");

    // Identifiers: first matched against the keyword table, otherwise
    // classified by their shape (UpName / DiscardName / Name).
    let keyword = text::ident().map(|s: String| match s.as_str() {
        "trace" => Token::Trace,
        // TODO: remove this in a future release
        "error" => Token::Fail,
        "fail" => Token::Fail,
        "once" => Token::Once,
        "as" => Token::As,
        "and" => Token::And,
        "or" => Token::Or,
        "expect" => Token::Expect,
        "const" => Token::Const,
        "fn" => Token::Fn,
        "test" => Token::Test,
        "if" => Token::If,
        "else" => Token::Else,
        "is" => Token::Is,
        "let" => Token::Let,
        "opaque" => Token::Opaque,
        "pub" => Token::Pub,
        "use" => Token::Use,
        "todo" => Token::Todo,
        "type" => Token::Type,
        "when" => Token::When,
        "validator" => Token::Validator,
        "via" => Token::Via,
        _ => {
            if s.chars().next().map_or(false, |c| c.is_uppercase()) {
                Token::UpName {
                    // TODO: do not allow _ in upname
                    name: s,
                }
            } else if s.starts_with('_') {
                Token::DiscardName {
                    // TODO: do not allow uppercase letters in discard name
                    name: s,
                }
            } else {
                Token::Name {
                    // TODO: do not allow uppercase letters in name
                    name: s,
                }
            }
        }
    });

    // Parser for one comment line. `n` is the number of leading slashes:
    // `////` module comment, `///` doc comment, `//` regular comment. The
    // comment's span excludes the slashes and stops before the newline.
    fn comment_parser(token: Token) -> impl Parser<char, (Token, Span), Error = ParseError> {
        let n = match token {
            Token::ModuleComment => 4,
            Token::DocComment => 3,
            Token::Comment => 2,
            _ => unreachable!(),
        };

        choice((
            // NOTE: The first case here works around a bug introduced with chumsky=0.9.0 which
            // miscalculate the offset for empty comments.
            just("/".repeat(n))
                .ignore_then(choice((text::newline().rewind(), end())))
                .to(token.clone())
                .map_with_span(move |token, span: Span| {
                    (token, Span::new((), span.start + n..span.end))
                }),
            just("/".repeat(n)).ignore_then(
                take_until(choice((text::newline().rewind(), end())))
                    .to(token)
                    .map_with_span(|token, span| (token, span)),
            ),
        ))
    }

    // Two consecutive line breaks lex as a single `EmptyLine`; one break is
    // a `NewLine`. Both LF and CRLF are supported.
    let newlines = choice((
        choice((just("\n\n"), just("\r\n\r\n"))).to(Token::EmptyLine),
        choice((just("\n"), just("\r\n"))).to(Token::NewLine),
    ));

    // Full token parser: comments first (so `//…` is never lexed as two
    // `/` operators), then the ordinary alternatives. Any character nothing
    // matches becomes a `Token::Error` with a diagnostic, letting lexing
    // continue past the bad input.
    choice((
        comment_parser(Token::ModuleComment),
        comment_parser(Token::DocComment),
        comment_parser(Token::Comment),
        choice((
            ordinal, keyword, int, op, newlines, grouping, bytestring, string,
        ))
        .or(any().map(Token::Error).validate(|t, span, emit| {
            emit(ParseError::expected_input_found(
                span,
                None,
                Some(t.clone()),
            ));
            t
        }))
        .map_with_span(|token, span| (token, span)),
    ))
    // Spaces and tabs around tokens are insignificant (newlines are not:
    // they are lexed above).
    .padded_by(one_of(" \t").ignored().repeated())
    .recover_with(skip_then_retry_until([]))
    .repeated()
    .padded_by(one_of(" \t").ignored().repeated())
    .then_ignore(end())
}
|