Skip to content

Commit

Permalink
lexer: update to pest 1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
birkenfeld committed Aug 4, 2018
1 parent b5dfb55 commit d2d342c
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 124 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@ authors = ["Georg Brandl <[email protected]>"]
getopts = "*"
rand = "*"
time = "*"
pest = "*"
encoding = "*"
pest = "*"
pest_derive = "*"
77 changes: 77 additions & 0 deletions src/lex.pest
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// Token grammar for the INTERCAL lexer (pest 1.0).
//
// `whitespace` is a special rule name in pest: it is implicitly allowed
// between `~`-sequenced sub-expressions, so multi-word tokens like
// COME ~ FROM match "COME FROM" with intervening blanks/newlines.
// The leading `_` makes the rule silent (it produces no pairs itself).
whitespace = _{ (" " | "\n" | "\t")+ }

// Entry point: one or more tokens. Silent, so the iterator over pairs
// yields the individual token rules directly.
tokens = _{ token+ }

// Ordered choice: longer/keyword rules are tried first; `UNKNOWN` is the
// last alternative and matches any single character, so tokenizing never
// fails on arbitrary input.
token = _{ NUMBER | syntax | gerund | verb | designator | operator | whitespace | UNKNOWN }

NUMBER = { '0'..'9'+ }
// Catch-all: consumes exactly one character that no other rule matched.
UNKNOWN = { any }

// Statement punctuation and syntax keywords.
syntax = _{ WAX | WANE | PLEASEDO | DO | NOT | GETS | SUB | BY |
OHOHSEVEN | INTERSECTION | WOW | MESH }
WAX = { "(" }
WANE = { ")" }
// "PLEASE" with an optional directly-following "DO"; must come before DO
// in the `syntax` alternatives so "PLEASE DO" is not split.
PLEASEDO = { "PLEASE" ~ "DO"? }
DO = { "DO" }
NOT = { "NOT" | "N'T" }
GETS = { "<-" }
SUB = { "SUB" }
BY = { "BY" }
OHOHSEVEN = { "%" }
INTERSECTION = { "+" }
WOW = { "!" }
MESH = { "#" }

// Statement verbs. Multi-word verbs use `~` so the words may be
// separated by (implicit) whitespace.
verb = _{ NEXT | RESUME | FORGET | IGNORE | REMEMBER | STASH |
RETRIEVE | ABSTAIN | FROM | REINSTATE | COMEFROM |
READOUT | WRITEIN | TRYAGAIN | GIVEUP }
NEXT = { "NEXT" }
RESUME = { "RESUME" }
FORGET = { "FORGET" }
IGNORE = { "IGNORE" }
REMEMBER = { "REMEMBER" }
STASH = { "STASH" }
RETRIEVE = { "RETRIEVE" }
ABSTAIN = { "ABSTAIN" }
FROM = { "FROM" }
REINSTATE = { "REINSTATE" }
COMEFROM = { "COME" ~ "FROM" }
READOUT = { "READ" ~ "OUT" }
WRITEIN = { "WRITE" ~ "IN" }
TRYAGAIN = { "TRY" ~ "AGAIN" }
GIVEUP = { "GIVE" ~ "UP" }

// Gerund forms, used in ABSTAIN FROM / REINSTATE lists.
gerund = _{ CALCULATING | NEXTING | RESUMING | FORGETTING |
IGNORING | REMEMBERING | STASHING | RETRIEVING |
ABSTAINING | REINSTATING | COMINGFROM | READINGOUT |
WRITINGIN | TRYINGAGAIN }
CALCULATING = { "CALCULATING" }
NEXTING = { "NEXTING" }
RESUMING = { "RESUMING" }
FORGETTING = { "FORGETTING" }
IGNORING = { "IGNORING" }
REMEMBERING = { "REMEMBERING" }
STASHING = { "STASHING" }
RETRIEVING = { "RETRIEVING" }
ABSTAINING = { "ABSTAINING" }
REINSTATING = { "REINSTATING" }
COMINGFROM = { "COMING" ~ "FROM" }
READINGOUT = { "READING" ~ "OUT" }
WRITINGIN = { "WRITING" ~ "IN" }
TRYINGAGAIN = { "TRYING" ~ "AGAIN" }

// Variable/array designators (INTERCAL sigils).
designator = _{ SPOT | TWOSPOT | TAIL | HYBRID }
SPOT = { "." }
TWOSPOT = { ":" }
TAIL = { "," }
HYBRID = { ";" }

// Expression operators. MONEY and WHAT accept several equivalent
// (including non-ASCII) spellings.
operator = _{ RABBITEARS | SPARK | MONEY | SQUIGGLE |
AMPERSAND | BOOK | WHAT }
RABBITEARS = { "\"" }
SPARK = { "'" }
MONEY = { "$" | "¢" | "¤" | "£" | "€" }
SQUIGGLE = { "~" }
AMPERSAND = { "&" }
BOOK = { "V" }
WHAT = { "?" | "∀" }
151 changes: 29 additions & 122 deletions src/lex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,97 +17,19 @@

#![allow(non_snake_case)]

use pest::Parser;
use pest::iterators::Pairs;

pub type SrcLine = usize;

/// A lexer for INTERCAL generated with Pest.
///
/// The raw Pest lexer is wrapped by a buffer iterator that adds a few
/// special methods, such as the pretty standard "peek" and "push back" features.
use pest::prelude::*;


pub type SrcLine = usize;


impl_rdp! {
grammar! {
whitespace = { ([" "] | ["\n"] | ["\t"])+ }

token = _{ NUMBER | syntax | gerund | verb | designator | operator |
whitespace | UNKNOWN }

NUMBER = { ['0'..'9']+ }
UNKNOWN = { any }

syntax = _{ WAX | WANE | PLEASEDO | DO | NOT | GETS | SUB | BY |
OHOHSEVEN | INTERSECTION | WOW | MESH }
WAX = { ["("] }
WANE = { [")"] }
PLEASEDO = { ["PLEASE"] ~ ["DO"]? }
DO = { ["DO"] }
NOT = { ["NOT"] | ["N'T"] }
GETS = { ["<-"] }
SUB = { ["SUB"] }
BY = { ["BY"] }
OHOHSEVEN = { ["%"] }
INTERSECTION = { ["+"] }
WOW = { ["!"] }
MESH = { ["#"] }

verb = _{ NEXT | RESUME | FORGET | IGNORE | REMEMBER | STASH |
RETRIEVE | ABSTAIN | FROM | REINSTATE | COMEFROM |
READOUT | WRITEIN | TRYAGAIN | GIVEUP }
NEXT = { ["NEXT"] }
RESUME = { ["RESUME"] }
FORGET = { ["FORGET"] }
IGNORE = { ["IGNORE"] }
REMEMBER = { ["REMEMBER"] }
STASH = { ["STASH"] }
RETRIEVE = { ["RETRIEVE"] }
ABSTAIN = { ["ABSTAIN"] }
FROM = { ["FROM"] }
REINSTATE = { ["REINSTATE"] }
COMEFROM = { ["COME"] ~ ["FROM"] }
READOUT = { ["READ"] ~ ["OUT"] }
WRITEIN = { ["WRITE"] ~ ["IN"] }
TRYAGAIN = { ["TRY"] ~ ["AGAIN"] }
GIVEUP = { ["GIVE"] ~ ["UP"] }

gerund = _{ CALCULATING | NEXTING | RESUMING | FORGETTING |
IGNORING | REMEMBERING | STASHING | RETRIEVING |
ABSTAINING | REINSTATING | COMINGFROM | READINGOUT |
WRITINGIN | TRYINGAGAIN }
CALCULATING = { ["CALCULATING"] }
NEXTING = { ["NEXTING"] }
RESUMING = { ["RESUMING"] }
FORGETTING = { ["FORGETTING"] }
IGNORING = { ["IGNORING"] }
REMEMBERING = { ["REMEMBERING"] }
STASHING = { ["STASHING"] }
RETRIEVING = { ["RETRIEVING"] }
ABSTAINING = { ["ABSTAINING"] }
REINSTATING = { ["REINSTATING"] }
COMINGFROM = { ["COMING"] ~ ["FROM"] }
READINGOUT = { ["READING"] ~ ["OUT"] }
WRITINGIN = { ["WRITING"] ~ ["IN"] }
TRYINGAGAIN = { ["TRYING"] ~ ["AGAIN"] }

designator = _{ SPOT | TWOSPOT | TAIL | HYBRID }
SPOT = { ["."] }
TWOSPOT = { [":"] }
TAIL = { [","] }
HYBRID = { [";"] }

operator = _{ RABBITEARS | SPARK | MONEY | SQUIGGLE |
AMPERSAND | BOOK | WHAT }
RABBITEARS = { ["\""] }
SPARK = { ["'"] }
MONEY = { ["$"] | ["¢"] | ["¤"] | ["£"] | ["€"] }
SQUIGGLE = { ["~"] }
AMPERSAND = { ["&"] }
BOOK = { ["V"] }
WHAT = { ["?"] | ["∀"] }
}
}
/// Parser type generated by `pest_derive` from the grammar in `lex.pest`
/// (resolved relative to `src/` at build time).
///
/// Parsing with `Rule::tokens` always succeeds on any input, because the
/// grammar's `UNKNOWN` rule matches any single character as a fallback.
#[derive(Parser)]
#[grammar = "lex.pest"]
struct PestLexer;

pub struct SrcToken {
pub line: SrcLine,
Expand All @@ -116,10 +38,10 @@ pub struct SrcToken {
}

pub struct Lexer<'a> {
rdp: Rdp<StringInput<'a>>,
rdpline: SrcLine,
stash: Vec<SrcToken>,
lastline: SrcLine,
inner: Pairs<'a, Rule>,
startline: SrcLine,
stash: Vec<SrcToken>,
lastline: SrcLine,
}

impl<'a> Iterator for Lexer<'a> {
Expand All @@ -138,38 +60,23 @@ impl<'a> Iterator for Lexer<'a> {

impl<'a> Lexer<'a> {
fn inner_next(&mut self) -> Option<SrcToken> {
// if we have some tokens stashed, just emit them
if !self.stash.is_empty() {
return self.stash.pop();
}
// else, request a new token from the lexer
while self.rdp.token() {
let line = self.rdpline;
// the queue can only ever consist of either a single token,
// or a token and an "inner" whitespace token
while let Some(tok) = self.rdp.queue_mut().pop() {
// jump over whitespace, but count up the line breaks
if tok.rule == Rule::whitespace {
self.rdpline += self.rdp.input().slice(tok.start, tok.end)
.chars().filter(|&ch| ch == '\n').count();
continue;
}
// convert from pest's Token to SrcToken
let srctoken = if tok.rule == Rule::NUMBER {
let text = self.rdp.input().slice(tok.start, tok.end);
SrcToken { line: line, rule: Rule::NUMBER,
value: text.parse().unwrap_or(u32::max_value()) }
} else if tok.rule == Rule::WOW {
self.stash.pop().or_else(|| {
self.inner.next().map(|pair| {
let rule = pair.as_rule();
let text = pair.as_str();
let line = pair.into_span().end_pos().line_col().0 - 1 + self.startline;
// convert into SrcToken
if rule == Rule::NUMBER {
SrcToken { line, rule, value: text.trim().parse().unwrap_or(u32::max_value()) }
} else if rule == Rule::WOW {
// handle ! = '. combination
self.stash.push(SrcToken { line: line, rule: Rule::SPOT, value: 0 });
SrcToken { line: line, rule: Rule::SPARK, value: 0 }
SrcToken { line, rule: Rule::SPARK, value: 0 }
} else {
SrcToken { line: line, rule: tok.rule, value: 0 }
};
return Some(srctoken);
}
}
None
SrcToken { line, rule, value: 0 }
}
})
})
}

pub fn peek(&mut self) -> Option<Rule> {
Expand All @@ -195,7 +102,7 @@ impl<'a> Lexer<'a> {
}

pub fn lex(s: &str, startline: usize) -> Lexer {
let input = StringInput::new(s);
Lexer { rdp: Rdp::new(input), rdpline: startline,
stash: vec![], lastline: startline }
// always succeeds since we have an UNKNOWN token
let inner = PestLexer::parse(Rule::tokens, s).unwrap();
Lexer { inner, startline, stash: vec![], lastline: startline }
}
3 changes: 2 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@
///
/// Parses arguments, calls parser, optimizer, interpreter or code generator.
#[macro_use]
extern crate pest;
#[macro_use]
extern crate pest_derive;
extern crate getopts;
extern crate rand;
extern crate time;
Expand Down

0 comments on commit d2d342c

Please sign in to comment.