From 9ba796161d6a0a7b90d25514c90cf7a5731ac818 Mon Sep 17 00:00:00 2001 From: Daniel Jones Date: Mon, 25 Feb 2013 21:25:25 -0800 Subject: [PATCH] First shot at a lexer. --- lex.go | 313 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mk.go | 24 +++++ 2 files changed, 337 insertions(+) create mode 100644 lex.go create mode 100644 mk.go diff --git a/lex.go b/lex.go new file mode 100644 index 0000000..7e2e324 --- /dev/null +++ b/lex.go @@ -0,0 +1,313 @@ + +package main + +import ( + "fmt" + "strings" + "unicode/utf8" +) + +type tokenType int + +const eof rune = '\000' + +const ( + tokenError tokenType = iota + tokenBareString + tokenQuotedString + tokenInclude + tokenColon + tokenAssign + tokenRecipe +) + + +func (typ tokenType) String() string { + switch typ { + case tokenError: return "[Error]" + case tokenBareString: return "[BareString]" + case tokenQuotedString: return "[QuotedString]" + case tokenInclude: return "[Include]" + case tokenColon: return "[Colon]" + case tokenAssign: return "[Assign]" + case tokenRecipe: return "[Recipe]" + } + return "[MysteryToken]" +} + + +type token struct { + typ tokenType // token type + val string // token string +} + + +func (t *token) String() string { + if t.typ == tokenError { + return t.val + } + + return fmt.Sprintf("%s %q", t.typ, t.val) +} + + +type lexer struct { + input string // input string to be lexed + output chan token // channel on which tokens are sent + start int // token beginning + pos int // position within input + line int // line within input + col int // column within input + errmsg string // set to an appropriate error message when necessary + indented bool // true if the only whitespace so far on this line +} + + +// A lexerStateFun is simultaneously the the state of the lexer and the next +// action the lexer will perform. +type lexerStateFun func (*lexer) lexerStateFun + + +func (l *lexer) lexerror(what string) { + l.errmsg = what + l.emit(tokenError) +} + + +// Return the nth character without advancing. +func (l *lexer) peekN(n int) (c rune) { + pos := l.pos + var width int + i := 0 + for ; i <= n && pos < len(l.input); i++ { + c, width = utf8.DecodeRuneInString(l.input[pos:]) + pos += width + } + + if i <= n { + return eof + } + + return +} + + +// Return the next character without advancing. +func (l *lexer) peek() rune { + return l.peekN(0) +} + + +// Consume and return the next character in the lexer input. +func (l *lexer) next() rune { + if l.pos >= len(l.input) { + return eof + } + c, width := utf8.DecodeRuneInString(l.input[l.pos:]) + l.pos += width + + if c == '\n' { + l.col = 0 + l.line += 1 + l.indented = true + } else { + l.col += 1 + if strings.IndexRune(" \t", c) < 0 { + l.indented = false + } + } + + return c +} + + +// Skip and return the next character in the lexer input. +func (l *lexer) skip() { + l.next() + l.start = l.pos +} + + +func (l *lexer) emit(typ tokenType) { + l.output <- token{typ, l.input[l.start:l.pos]} + l.start = l.pos +} + + +// Consume the next run if it is in the given string. +func (l *lexer) accept(valid string) bool { + if strings.IndexRune(valid, l.peek()) >= 0 { + l.next() + return true + } + return false +} + + +// Consume characters from the valid string until the next is not. +func (l *lexer) acceptRun(valid string) int { + prevpos := l.pos + for strings.IndexRune(valid, l.peek()) >= 0 { + l.next() + } + return l.pos - prevpos +} + + +// Accept until something from the given string is encountered. +func (l *lexer) acceptUntil(invalid string) { + for l.pos < len(l.input) && strings.IndexRune(invalid, l.peek()) < 0 { + l.next() + } +} + + +// Skip characters from the valid string until the next is not. +func (l* lexer) skipRun(valid string) int { + prevpos := l.pos + for strings.IndexRune(valid, l.peek()) >= 0 { + l.skip() + } + return l.pos - prevpos +} + + +// Skip until something from the given string is encountered. +func (l *lexer) skipUntil(invalid string) { + for l.pos < len(l.input) && strings.IndexRune(invalid, l.peek()) < 0 { + l.skip() + } +} + + +// Start a new lexer to lex the given input. +func lex(input string) (*lexer, chan token) { + l := &lexer{input: input, output: make(chan token)} + go l.run() + return l, l.output +} + + +func (l *lexer) run() { + for state := lexTopLevel; state != nil; { + state = state(l) + } + close(l.output) +} + + +// What do we need? +// A function that consumes non-newline whitespace. +// A way of determining if the current line might be a recipe. + + +func lexTopLevel (l *lexer) lexerStateFun { + + for { + l.skipRun(" \t\n\r") + if l.peek() == '\'' && l.peekN(1) == '\n' { + l.next() + l.next() + l.indented = false + } else { + break + } + } + + if l.indented && l.col > 0 { + return lexRecipe + } + + c := l.peek() + switch c { + case eof: return nil + case '#': return lexComment + case '<': return lexInclude + case '"': return lexDoubleQuote + case '\'': return lexSingleQuote + case ':': return lexColon + case '=': return lexAssign + } + + return lexBareString +} + + +func lexColon (l* lexer) lexerStateFun { + l.next() + l.emit(tokenColon) + return lexTopLevel +} + + +func lexAssign (l* lexer) lexerStateFun { + l.next() + l.emit(tokenAssign) + return lexTopLevel +} + + +func lexComment (l* lexer) lexerStateFun { + l.skip() // '#' + l.skipUntil("\n") + return lexTopLevel +} + + +func lexInclude (l* lexer) lexerStateFun { + l.skip() // '<' + l.skipRun(" \t\n\r") + l.acceptUntil("\n\r") + l.emit(tokenInclude) + return lexTopLevel +} + + +func lexDoubleQuote (l *lexer) lexerStateFun { + l.skip() // '"' + for l.peek() != '"' { + l.acceptUntil("\\\"") + if l.accept("\\") { + l.accept("\"") + } + } + l.emit(tokenQuotedString) + l.skip() // skip '"' + return lexTopLevel +} + + +func lexSingleQuote (l *lexer) lexerStateFun { + l.skip() // '\'' + l.acceptUntil("'") + l.emit(tokenQuotedString) + l.skip() // '\'' + return lexTopLevel +} + + +func lexRecipe (l *lexer) lexerStateFun { + + for { + l.acceptUntil("\n") + l.acceptRun(" \t\n\r") + if !l.indented || l.col == 0 { + break + } + } + + // TODO: don't emit if there is only whitespace in the recipe + l.emit(tokenRecipe) + return lexTopLevel +} + + +func lexBareString (l *lexer) lexerStateFun { + // TODO: allow escaping spaces and tabs? + // TODO: allow adjacent quoted string, e.g.: foo"bar"baz? + l.acceptUntil(" \t\n\r\\=:#'\"") + l.emit(tokenBareString) + return lexTopLevel +} + + diff --git a/mk.go b/mk.go new file mode 100644 index 0000000..b71eb79 --- /dev/null +++ b/mk.go @@ -0,0 +1,24 @@ + +package main + +import ( + "fmt" + "os" + "io/ioutil" +) + +func main() { + input, _ := ioutil.ReadAll(os.Stdin) + l, tokens := lex(string(input)) + + for t := range tokens { + if t.typ == tokenError { + fmt.Printf("Error: %s", l.errmsg) + break + } + + fmt.Println(t.String()) + } +} + +