From 9ba796161d6a0a7b90d25514c90cf7a5731ac818 Mon Sep 17 00:00:00 2001
From: Daniel Jones <dcjones@cs.washington.edu>
Date: Mon, 25 Feb 2013 21:25:25 -0800
Subject: [PATCH] First shot at a lexer.

---
 lex.go | 313 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mk.go  |  24 +++++
 2 files changed, 337 insertions(+)
 create mode 100644 lex.go
 create mode 100644 mk.go

diff --git a/lex.go b/lex.go
new file mode 100644
index 0000000..7e2e324
--- /dev/null
+++ b/lex.go
@@ -0,0 +1,313 @@
+
+package main
+
+import (
+    "fmt"
+    "strings"
+    "unicode/utf8"
+)
+
+// tokenType identifies the kind of a token produced by the lexer.
+type tokenType int
+
+// eof is the sentinel rune that next and peekN return at the end of input.
+const eof rune = '\000'
+
+// The kinds of token the lexer can emit.
+const (
+    tokenError tokenType = iota // emitted by lexerror; the message is in lexer.errmsg
+    tokenBareString             // unquoted word
+    tokenQuotedString           // text between single or double quotes
+    tokenInclude                // rest of the line following '<'
+    tokenColon                  // ':'
+    tokenAssign                 // '='
+    tokenRecipe                 // text starting on an indented line
+)
+
+
+// String returns the bracketed display name of the token type, or
+// "[MysteryToken]" for a value that is not one of the declared constants.
+func (typ tokenType) String() string {
+    // Keyed by constant so the table does not depend on declaration order.
+    names := [...]string{
+        tokenError:        "[Error]",
+        tokenBareString:   "[BareString]",
+        tokenQuotedString: "[QuotedString]",
+        tokenInclude:      "[Include]",
+        tokenColon:        "[Colon]",
+        tokenAssign:       "[Assign]",
+        tokenRecipe:       "[Recipe]",
+    }
+    if typ < 0 || int(typ) >= len(names) {
+        return "[MysteryToken]"
+    }
+    return names[typ]
+}
+
+
+// token is a single lexical item sent on the lexer's output channel.
+type token struct {
+    typ tokenType // token type
+    val string    // token string
+}
+
+
+// String formats the token for display: an error token is rendered as its
+// raw value, any other token as its type name followed by the quoted value.
+func (t *token) String() string {
+    if t.typ != tokenError {
+        return fmt.Sprintf("%s %q", t.typ, t.val)
+    }
+    return t.val
+}
+
+
+// lexer holds the state of a single pass over one input string. Tokens are
+// sent on output as they are recognised.
+type lexer struct {
+    input    string     // input string to be lexed
+    output   chan token // channel on which tokens are sent
+    start    int        // byte offset at which the pending token begins
+    pos      int        // byte offset of the next unread rune in input
+    line     int        // number of newlines consumed so far
+    col      int        // runes consumed since the last newline
+    errmsg   string     // set to an appropriate error message when necessary
+    indented bool       // set by a newline, cleared by the first non-blank rune after it
+}
+
+
+// A lexerStateFun is simultaneously the state of the lexer and the next
+// action the lexer will perform. It returns the state to run next; a nil
+// result ends the run loop.
+type lexerStateFun func (*lexer) lexerStateFun
+
+
+// lexerror records what as the lexer's error message and emits a tokenError
+// carrying the input consumed since the last token.
+func (l *lexer) lexerror(what string) {
+    l.errmsg = what
+    l.emit(tokenError)
+}
+
+
+// peekN returns the rune n positions past the next one (so n == 0 is the next
+// rune) without advancing the lexer. It returns eof when the input ends
+// before that rune is reached.
+func (l *lexer) peekN(n int) (c rune) {
+    rest := l.input[l.pos:]
+    for i := 0; i <= n; i++ {
+        if rest == "" {
+            return eof
+        }
+        var size int
+        c, size = utf8.DecodeRuneInString(rest)
+        rest = rest[size:]
+    }
+    return c
+}
+
+
+// peek returns the next rune without advancing, or eof at the end of input.
+func (l *lexer) peek() rune {
+    return l.peekN(0)
+}
+
+
+// next consumes and returns the next rune of the input, or returns eof
+// without advancing once the input is exhausted. It also keeps line, col and
+// indented up to date.
+func (l *lexer) next() rune {
+    if l.pos >= len(l.input) {
+        return eof
+    }
+
+    r, size := utf8.DecodeRuneInString(l.input[l.pos:])
+    l.pos += size
+
+    switch r {
+    case '\n':
+        // A fresh line counts as indented until a non-blank rune shows up.
+        l.line++
+        l.col = 0
+        l.indented = true
+    case ' ', '\t':
+        l.col++
+    default:
+        l.col++
+        l.indented = false
+    }
+    return r
+}
+
+
+// skip consumes the next character and drops it (together with anything else
+// pending) from the token being built. It returns nothing.
+func (l *lexer) skip() {
+    l.next()
+    l.start = l.pos
+}
+
+
+// emit sends the pending text input[start:pos] on the output channel as a
+// token of the given type, then starts the next token at pos. The send blocks
+// until the token is received, since lex creates the channel unbuffered.
+func (l *lexer) emit(typ tokenType) {
+    l.output <- token{typ, l.input[l.start:l.pos]}
+    l.start = l.pos
+}
+
+
+// accept consumes the next rune if it occurs in valid and reports whether it
+// did so.
+func (l *lexer) accept(valid string) bool {
+    if !strings.ContainsRune(valid, l.peek()) {
+        return false
+    }
+    l.next()
+    return true
+}
+
+
+// acceptRun consumes runes for as long as the next one occurs in valid and
+// returns the number of bytes consumed.
+func (l *lexer) acceptRun(valid string) int {
+    begin := l.pos
+    for strings.ContainsRune(valid, l.peek()) {
+        l.next()
+    }
+    consumed := l.pos - begin
+    return consumed
+}
+
+
+// acceptUntil consumes runes until the next one occurs in invalid or the
+// input is exhausted. The stopping rune itself is not consumed.
+func (l *lexer) acceptUntil(invalid string) {
+    for l.pos < len(l.input) {
+        if strings.ContainsRune(invalid, l.peek()) {
+            return
+        }
+        l.next()
+    }
+}
+
+
+// skipRun discards runes for as long as the next one occurs in valid and
+// returns the number of bytes discarded.
+func (l *lexer) skipRun(valid string) int {
+    begin := l.pos
+    for strings.ContainsRune(valid, l.peek()) {
+        l.skip()
+    }
+    discarded := l.pos - begin
+    return discarded
+}
+
+
+// skipUntil discards runes until the next one occurs in invalid or the input
+// is exhausted. The stopping rune itself is left in place.
+func (l *lexer) skipUntil(invalid string) {
+    for l.pos < len(l.input) {
+        if strings.ContainsRune(invalid, l.peek()) {
+            return
+        }
+        l.skip()
+    }
+}
+
+
+// lex starts lexing input in a new goroutine. It returns the lexer together
+// with the unbuffered channel on which the tokens are delivered.
+func lex(input string) (*lexer, chan token) {
+    tokens := make(chan token)
+    l := &lexer{input: input, output: tokens}
+    go l.run()
+    return l, tokens
+}
+
+
+// run steps the state machine, beginning with lexTopLevel, until a state
+// returns nil, and then closes the output channel.
+func (l *lexer) run() {
+    state := lexerStateFun(lexTopLevel)
+    for state != nil {
+        state = state(l)
+    }
+    close(l.output)
+}
+
+
+// What do we need?
+// A function that consumes non-newline whitespace.
+// A way of determining if the current line might be a recipe.
+
+
+// lexTopLevel skips whitespace and line continuations (a backslash directly
+// followed by a newline) and then chooses the next state from the first
+// pending rune. Text that starts on an indented line is lexed as a recipe.
+// It returns nil, stopping the lexer, at the end of input.
+func lexTopLevel(l *lexer) lexerStateFun {
+    for {
+        l.skipRun(" \t\n\r")
+        if l.peek() != '\\' || l.peekN(1) != '\n' {
+            break
+        }
+        // Discard the continuation with skip rather than next so that it
+        // cannot end up inside the following token. The continued line is
+        // not treated as indented.
+        l.skip()
+        l.skip()
+        l.indented = false
+    }
+
+    // Test for end of input before the recipe check: trailing indentation at
+    // the end of the input must not start a recipe, because nothing is left
+    // for it to consume and the lexer would never make progress again.
+    c := l.peek()
+    if c == eof {
+        return nil
+    }
+
+    if l.indented && l.col > 0 {
+        return lexRecipe
+    }
+
+    switch c {
+    case '#':
+        return lexComment
+    case '<':
+        return lexInclude
+    case '"':
+        return lexDoubleQuote
+    case '\'':
+        return lexSingleQuote
+    case ':':
+        return lexColon
+    case '=':
+        return lexAssign
+    }
+
+    return lexBareString
+}
+
+
+// lexColon consumes the pending ':' and emits it as a tokenColon.
+func lexColon (l* lexer) lexerStateFun {
+    l.next()
+    l.emit(tokenColon)
+    return lexTopLevel
+}
+
+
+// lexAssign consumes the pending '=' and emits it as a tokenAssign.
+func lexAssign (l* lexer) lexerStateFun {
+    l.next()
+    l.emit(tokenAssign)
+    return lexTopLevel
+}
+
+
+// lexComment discards a comment: the leading '#' and everything up to, but
+// not including, the next newline. No token is emitted.
+func lexComment (l* lexer) lexerStateFun {
+    l.skip() // '#'
+    l.skipUntil("\n")
+    return lexTopLevel
+}
+
+
+// lexInclude discards the leading '<' and any whitespace after it (newlines
+// included), then emits everything up to the next line break as a
+// tokenInclude.
+func lexInclude (l* lexer) lexerStateFun {
+    l.skip() // '<'
+    l.skipRun(" \t\n\r")
+    l.acceptUntil("\n\r")
+    l.emit(tokenInclude)
+    return lexTopLevel
+}
+
+
+// lexDoubleQuote lexes a double-quoted string and emits the text between the
+// quotes as a tokenQuotedString. A quote preceded by a backslash does not end
+// the string. If the input ends before the closing quote, the error is
+// reported through lexerror and nil is returned to stop the lexer.
+func lexDoubleQuote(l *lexer) lexerStateFun {
+    l.skip() // '"'
+    for l.peek() != '"' {
+        if l.pos >= len(l.input) {
+            // Nothing below can advance at end of input, so without this
+            // check an unterminated string loops forever.
+            l.lexerror("end of file encountered while parsing a quoted string")
+            return nil
+        }
+        l.acceptUntil("\\\"")
+        if l.accept("\\") {
+            l.accept("\"")
+        }
+    }
+    l.emit(tokenQuotedString)
+    l.skip() // skip '"'
+    return lexTopLevel
+}
+
+
+// lexSingleQuote lexes a single-quoted string and emits the text between the
+// quotes as a tokenQuotedString. No escapes are recognised inside it. If the
+// input ends before the closing quote, the error is reported through lexerror
+// and nil is returned to stop the lexer, rather than passing the unterminated
+// text off as a valid string.
+func lexSingleQuote(l *lexer) lexerStateFun {
+    l.skip() // '\''
+    l.acceptUntil("'")
+    if l.peek() != '\'' {
+        // acceptUntil stops only at a quote or at the end of input.
+        l.lexerror("end of file encountered while parsing a quoted string")
+        return nil
+    }
+    l.emit(tokenQuotedString)
+    l.skip() // '\''
+    return lexTopLevel
+}
+
+
+// lexRecipe consumes a recipe: the rest of the current line plus every
+// following line that begins with indentation. The text is emitted as a
+// single tokenRecipe, including the whitespace consumed while looking for the
+// end of the recipe. At the end of input it returns nil to stop the lexer.
+func lexRecipe(l *lexer) lexerStateFun {
+    // The bound on pos matters: once the input is exhausted neither call in
+    // the loop can advance, and if the input ends in indentation the exit
+    // condition below never becomes true.
+    for l.pos < len(l.input) {
+        l.acceptUntil("\n")
+        l.acceptRun(" \t\n\r")
+        if !l.indented || l.col == 0 {
+            break
+        }
+    }
+
+    // TODO: don't emit if there is only whitespace in the recipe
+    if l.pos > l.start {
+        l.emit(tokenRecipe)
+    }
+
+    if l.pos >= len(l.input) {
+        // Stop here instead of going back to the top level, which could send
+        // us straight back while the line still counts as indented.
+        return nil
+    }
+    return lexTopLevel
+}
+
+
+// lexBareString consumes an unquoted word, ending at whitespace or at one of
+// the special characters \ = : # ' ", and emits it as a tokenBareString.
+func lexBareString(l *lexer) lexerStateFun {
+    // TODO: allow escaping spaces and tabs?
+    // TODO: allow adjacent quoted string, e.g.: foo"bar"baz?
+    begin := l.pos
+    l.acceptUntil(" \t\n\r\\=:#'\"")
+    if l.pos == begin {
+        // The word starts with a stop character that lexTopLevel does not
+        // dispatch to another state, i.e. a stray backslash. Take it
+        // literally: emitting an empty token would bring the lexer back here
+        // in the same position forever.
+        l.next()
+    }
+    l.emit(tokenBareString)
+    return lexTopLevel
+}
+
+
diff --git a/mk.go b/mk.go
new file mode 100644
index 0000000..b71eb79
--- /dev/null
+++ b/mk.go
@@ -0,0 +1,24 @@
+
+package main
+
+import (
+    "fmt"
+    "os"
+    "io/ioutil"
+)
+
+// main lexes standard input and prints one token per line. A read failure or
+// a lexing error is reported on standard error and ends the program with
+// exit status 1.
+func main() {
+    input, err := ioutil.ReadAll(os.Stdin)
+    if err != nil {
+        fmt.Fprintf(os.Stderr, "Error: %s\n", err)
+        os.Exit(1)
+    }
+
+    l, tokens := lex(string(input))
+    for t := range tokens {
+        if t.typ == tokenError {
+            // errmsg is written by the lexer goroutine before it sends the
+            // error token, so it is safe to read once that token arrives.
+            fmt.Fprintf(os.Stderr, "Error: %s\n", l.errmsg)
+            os.Exit(1)
+        }
+
+        fmt.Println(t.String())
+    }
+}
+
+