From d2d342c7540d97a661ced45042caf78f2d743ae1 Mon Sep 17 00:00:00 2001 From: Georg Brandl Date: Sat, 4 Aug 2018 09:02:23 +0200 Subject: [PATCH] lexer: update to pest 1.0 --- Cargo.toml | 3 +- src/lex.pest | 77 ++++++++++++++++++++++++++ src/lex.rs | 151 ++++++++++----------------------------------------- src/main.rs | 3 +- 4 files changed, 110 insertions(+), 124 deletions(-) create mode 100644 src/lex.pest diff --git a/Cargo.toml b/Cargo.toml index c867b32..7b92f15 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,5 +7,6 @@ authors = ["Georg Brandl "] getopts = "*" rand = "*" time = "*" -pest = "*" encoding = "*" +pest = "*" +pest_derive = "*" diff --git a/src/lex.pest b/src/lex.pest new file mode 100644 index 0000000..0cd1bf7 --- /dev/null +++ b/src/lex.pest @@ -0,0 +1,77 @@ +whitespace = _{ (" " | "\n" | "\t")+ } + +tokens = _{ token+ } + +token = _{ NUMBER | syntax | gerund | verb | designator | operator | whitespace | UNKNOWN } + +NUMBER = { '0'..'9'+ } +UNKNOWN = { any } + +syntax = _{ WAX | WANE | PLEASEDO | DO | NOT | GETS | SUB | BY | + OHOHSEVEN | INTERSECTION | WOW | MESH } +WAX = { "(" } +WANE = { ")" } +PLEASEDO = { "PLEASE" ~ "DO"? } +DO = { "DO" } +NOT = { "NOT" | "N'T" } +GETS = { "<-" } +SUB = { "SUB" } +BY = { "BY" } +OHOHSEVEN = { "%" } +INTERSECTION = { "+" } +WOW = { "!" } +MESH = { "#" } + +verb = _{ NEXT | RESUME | FORGET | IGNORE | REMEMBER | STASH | + RETRIEVE | ABSTAIN | FROM | REINSTATE | COMEFROM | + READOUT | WRITEIN | TRYAGAIN | GIVEUP } +NEXT = { "NEXT" } +RESUME = { "RESUME" } +FORGET = { "FORGET" } +IGNORE = { "IGNORE" } +REMEMBER = { "REMEMBER" } +STASH = { "STASH" } +RETRIEVE = { "RETRIEVE" } +ABSTAIN = { "ABSTAIN" } +FROM = { "FROM" } +REINSTATE = { "REINSTATE" } +COMEFROM = { "COME" ~ "FROM" } +READOUT = { "READ" ~ "OUT" } +WRITEIN = { "WRITE" ~ "IN" } +TRYAGAIN = { "TRY" ~ "AGAIN" } +GIVEUP = { "GIVE" ~ "UP" } + +gerund = _{ CALCULATING | NEXTING | RESUMING | FORGETTING | + IGNORING | REMEMBERING | STASHING | RETRIEVING | + ABSTAINING | REINSTATING | COMINGFROM | READINGOUT | + WRITINGIN | TRYINGAGAIN } +CALCULATING = { "CALCULATING" } +NEXTING = { "NEXTING" } +RESUMING = { "RESUMING" } +FORGETTING = { "FORGETTING" } +IGNORING = { "IGNORING" } +REMEMBERING = { "REMEMBERING" } +STASHING = { "STASHING" } +RETRIEVING = { "RETRIEVING" } +ABSTAINING = { "ABSTAINING" } +REINSTATING = { "REINSTATING" } +COMINGFROM = { "COMING" ~ "FROM" } +READINGOUT = { "READING" ~ "OUT" } +WRITINGIN = { "WRITING" ~ "IN" } +TRYINGAGAIN = { "TRYING" ~ "AGAIN" } + +designator = _{ SPOT | TWOSPOT | TAIL | HYBRID } +SPOT = { "." } +TWOSPOT = { ":" } +TAIL = { "," } +HYBRID = { ";" } + +operator = _{ RABBITEARS | SPARK | MONEY | SQUIGGLE | + AMPERSAND | BOOK | WHAT } +RABBITEARS = { "\"" } +SPARK = { "'" } +MONEY = { "$" | "¢" | "¤" | "£" | "€" } +SQUIGGLE = { "~" } +AMPERSAND = { "&" } +BOOK = { "V" } +WHAT = { "?" | "∀" } \ No newline at end of file diff --git a/src/lex.rs b/src/lex.rs index cfd374c..af448e9 100644 --- a/src/lex.rs +++ b/src/lex.rs @@ -17,97 +17,19 @@ #![allow(non_snake_case)] +use pest::Parser; +use pest::iterators::Pairs; + +pub type SrcLine = usize; + /// A lexer for INTERCAL generated with Pest. /// /// The raw Pest lexer is wrapped by a buffer iterator that adds a few /// special methods, such as the pretty standard "peek" and "push back" features. -use pest::prelude::*; - - -pub type SrcLine = usize; - - -impl_rdp! { - grammar! { - whitespace = { ([" "] | ["\n"] | ["\t"])+ } - - token = _{ NUMBER | syntax | gerund | verb | designator | operator | - whitespace | UNKNOWN } - - NUMBER = { ['0'..'9']+ } - UNKNOWN = { any } - - syntax = _{ WAX | WANE | PLEASEDO | DO | NOT | GETS | SUB | BY | - OHOHSEVEN | INTERSECTION | WOW | MESH } - WAX = { ["("] } - WANE = { [")"] } - PLEASEDO = { ["PLEASE"] ~ ["DO"]? } - DO = { ["DO"] } - NOT = { ["NOT"] | ["N'T"] } - GETS = { ["<-"] } - SUB = { ["SUB"] } - BY = { ["BY"] } - OHOHSEVEN = { ["%"] } - INTERSECTION = { ["+"] } - WOW = { ["!"] } - MESH = { ["#"] } - - verb = _{ NEXT | RESUME | FORGET | IGNORE | REMEMBER | STASH | - RETRIEVE | ABSTAIN | FROM | REINSTATE | COMEFROM | - READOUT | WRITEIN | TRYAGAIN | GIVEUP } - NEXT = { ["NEXT"] } - RESUME = { ["RESUME"] } - FORGET = { ["FORGET"] } - IGNORE = { ["IGNORE"] } - REMEMBER = { ["REMEMBER"] } - STASH = { ["STASH"] } - RETRIEVE = { ["RETRIEVE"] } - ABSTAIN = { ["ABSTAIN"] } - FROM = { ["FROM"] } - REINSTATE = { ["REINSTATE"] } - COMEFROM = { ["COME"] ~ ["FROM"] } - READOUT = { ["READ"] ~ ["OUT"] } - WRITEIN = { ["WRITE"] ~ ["IN"] } - TRYAGAIN = { ["TRY"] ~ ["AGAIN"] } - GIVEUP = { ["GIVE"] ~ ["UP"] } - - gerund = _{ CALCULATING | NEXTING | RESUMING | FORGETTING | - IGNORING | REMEMBERING | STASHING | RETRIEVING | - ABSTAINING | REINSTATING | COMINGFROM | READINGOUT | - WRITINGIN | TRYINGAGAIN } - CALCULATING = { ["CALCULATING"] } - NEXTING = { ["NEXTING"] } - RESUMING = { ["RESUMING"] } - FORGETTING = { ["FORGETTING"] } - IGNORING = { ["IGNORING"] } - REMEMBERING = { ["REMEMBERING"] } - STASHING = { ["STASHING"] } - RETRIEVING = { ["RETRIEVING"] } - ABSTAINING = { ["ABSTAINING"] } - REINSTATING = { ["REINSTATING"] } - COMINGFROM = { ["COMING"] ~ ["FROM"] } - READINGOUT = { ["READING"] ~ ["OUT"] } - WRITINGIN = { ["WRITING"] ~ ["IN"] } - TRYINGAGAIN = { ["TRYING"] ~ ["AGAIN"] } - - designator = _{ SPOT | TWOSPOT | TAIL | HYBRID } - SPOT = { ["."] } - TWOSPOT = { [":"] } - TAIL = { [","] } - HYBRID = { [";"] } - - operator = _{ RABBITEARS | SPARK | MONEY | SQUIGGLE | - AMPERSAND | BOOK | WHAT } - RABBITEARS = { ["\""] } - SPARK = { ["'"] } - MONEY = { ["$"] | ["¢"] | ["¤"] | ["£"] | ["€"] } - SQUIGGLE = { ["~"] } - AMPERSAND = { ["&"] } - BOOK = { ["V"] } - WHAT = { ["?"] | ["∀"] } - } -} +#[derive(Parser)] +#[grammar = "lex.pest"] +struct PestLexer; pub struct SrcToken { pub line: SrcLine, @@ -116,10 +38,10 @@ pub struct SrcToken { } pub struct Lexer<'a> { - rdp: Rdp>, - rdpline: SrcLine, - stash: Vec, - lastline: SrcLine, + inner: Pairs<'a, Rule>, + startline: SrcLine, + stash: Vec, + lastline: SrcLine, } impl<'a> Iterator for Lexer<'a> { @@ -138,38 +60,23 @@ impl<'a> Iterator for Lexer<'a> { impl<'a> Lexer<'a> { fn inner_next(&mut self) -> Option { - // if we have some tokens stashed, just emit them - if !self.stash.is_empty() { - return self.stash.pop(); - } - // else, request a new token from the lexer - while self.rdp.token() { - let line = self.rdpline; - // the queue can only ever consist of either a single token, - // or a token and an "inner" whitespace token - while let Some(tok) = self.rdp.queue_mut().pop() { - // jump over whitespace, but count up the line breaks - if tok.rule == Rule::whitespace { - self.rdpline += self.rdp.input().slice(tok.start, tok.end) - .chars().filter(|&ch| ch == '\n').count(); - continue; - } - // convert from pest's Token to SrcToken - let srctoken = if tok.rule == Rule::NUMBER { - let text = self.rdp.input().slice(tok.start, tok.end); - SrcToken { line: line, rule: Rule::NUMBER, - value: text.parse().unwrap_or(u32::max_value()) } - } else if tok.rule == Rule::WOW { + self.stash.pop().or_else(|| { + self.inner.next().map(|pair| { + let rule = pair.as_rule(); + let text = pair.as_str(); + let line = pair.into_span().end_pos().line_col().0 - 1 + self.startline; + // convert into SrcToken + if rule == Rule::NUMBER { + SrcToken { line, rule, value: text.trim().parse().unwrap_or(u32::max_value()) } + } else if rule == Rule::WOW { // handle ! = '. combination self.stash.push(SrcToken { line: line, rule: Rule::SPOT, value: 0 }); - SrcToken { line: line, rule: Rule::SPARK, value: 0 } + SrcToken { line, rule: Rule::SPARK, value: 0 } } else { - SrcToken { line: line, rule: tok.rule, value: 0 } - }; - return Some(srctoken); - } - } - None + SrcToken { line, rule, value: 0 } + } + }) + }) } pub fn peek(&mut self) -> Option { @@ -195,7 +102,7 @@ impl<'a> Lexer<'a> { } pub fn lex(s: &str, startline: usize) -> Lexer { - let input = StringInput::new(s); - Lexer { rdp: Rdp::new(input), rdpline: startline, - stash: vec![], lastline: startline } + // always succeeds since we have an UNKNOWN token + let inner = PestLexer::parse(Rule::tokens, s).unwrap(); + Lexer { inner, startline, stash: vec![], lastline: startline } } diff --git a/src/main.rs b/src/main.rs index 4b9ea37..803a9d9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -22,8 +22,9 @@ /// /// Parses arguments, calls parser, optimizer, interpreter or code generator. -#[macro_use] extern crate pest; +#[macro_use] +extern crate pest_derive; extern crate getopts; extern crate rand; extern crate time;