mk/lex.go


package main

import (
    "fmt"
    "strings"
    "unicode/utf8"
)

// tokenType identifies the kind of a token produced by the lexer.
type tokenType int

// eof is the NUL rune the lexer's peek and next helpers return once the input
// is exhausted.
const eof rune = '\x00'

// Token kinds, numbered consecutively from zero in declaration order.
const (
    // tokenError is emitted by lexerror.
    tokenError tokenType = iota
    // tokenBareString is an unquoted word.
    tokenBareString
    // tokenQuotedString is the text between a pair of single or double quotes.
    tokenQuotedString
    // tokenInclude is the text following a '<' up to the end of the line.
    tokenInclude
    // tokenColon is a ':' character.
    tokenColon
    // tokenAssign is an '=' character.
    tokenAssign
    // tokenRecipe is a run of indented lines.
    tokenRecipe
)


// String returns the bracketed display name of the token type, for example
// "[Colon]". Values that do not correspond to a declared token constant yield
// "[MysteryToken]".
func (typ tokenType) String() string {
    names := [...]string{
        tokenError:        "[Error]",
        tokenBareString:   "[BareString]",
        tokenQuotedString: "[QuotedString]",
        tokenInclude:      "[Include]",
        tokenColon:        "[Colon]",
        tokenAssign:       "[Assign]",
        tokenRecipe:       "[Recipe]",
    }
    if typ < 0 || int(typ) >= len(names) {
        return "[MysteryToken]"
    }
    return names[typ]
}


// A token is one lexical element: its kind plus the slice of input text that
// the lexer had accumulated when the token was emitted.
type token struct {
    typ tokenType // token type
    val string    // token text: the input between the lexer's start and pos at emit time
}


// String formats the token for display. An error token is rendered as its
// bare value; every other token is rendered as its type name followed by the
// quoted value, e.g. `[BareString] "foo"`.
//
// The receiver is a value rather than a pointer so that both token and *token
// satisfy fmt.Stringer. Tokens travel over the lexer's channel by value, and
// with a pointer receiver fmt would ignore this method for them and print the
// raw struct instead.
func (t token) String() string {
    if t.typ == tokenError {
        return t.val
    }

    return fmt.Sprintf("%s %q", t.typ, t.val)
}


// A lexer holds the scanning state for one input string. Recognized tokens
// are sent on the output channel.
type lexer struct {
    input    string     // input string to be lexed
    output   chan token // channel on which tokens are sent
    start    int        // byte offset in input where the pending token begins
    pos      int        // byte offset in input of the next rune to be read
    line     int        // number of newlines consumed so far
    col      int        // number of runes consumed since the last newline
    errmsg   string     // set to an appropriate error message when necessary
    indented bool       // set on each newline; cleared by the first rune that is not a space or tab
}


// A lexerStateFun is simultaneously the state of the lexer and the next
// action the lexer will perform. Calling it advances the lexer and returns
// the following state; a nil result stops the lexer.
type lexerStateFun func (*lexer) lexerStateFun


// lexerror records what as the lexer's error message and emits a tokenError.
// Note that the emitted token's value is the pending input text (as with any
// other emit), not the message; the message is only available via l.errmsg.
func (l *lexer) lexerror(what string) {
    l.errmsg = what
    l.emit(tokenError)
}


// peekN returns the rune n positions ahead of the current position without
// consuming any input; n == 0 is the rune that next would return. It returns
// eof when fewer than n+1 runes remain.
func (l *lexer) peekN(n int) (c rune) {
    offset := l.pos
    for seen := 0; seen <= n; seen++ {
        if offset >= len(l.input) {
            return eof
        }
        var size int
        c, size = utf8.DecodeRuneInString(l.input[offset:])
        offset += size
    }
    return c
}


// peek returns the next rune in the input without consuming it, or eof if the
// input is exhausted.
func (l *lexer) peek() rune {
    return l.peekN(0)
}


// next consumes one rune from the input and returns it, or returns eof without
// changing any state when the input is exhausted.
//
// It also maintains the position bookkeeping: a newline resets col, bumps
// line and marks the new line as indented; any other rune advances col, and
// anything other than a space or tab clears the indented flag.
func (l *lexer) next() rune {
    if l.pos >= len(l.input) {
        return eof
    }

    r, size := utf8.DecodeRuneInString(l.input[l.pos:])
    l.pos += size

    switch r {
    case '\n':
        l.col = 0
        l.line++
        l.indented = true
    case ' ', '\t':
        l.col++
    default:
        l.col++
        l.indented = false
    }

    return r
}


// skip consumes the next rune and discards it, together with any other pending
// text, by moving the start of the pending token up to the current position.
// Nothing is returned.
func (l *lexer) skip() {
    l.next()
    l.start = l.pos
}


// emit sends the pending text (input[start:pos]) on the output channel as a
// token of the given type, then starts a new pending token at pos.
func (l *lexer) emit(typ tokenType) {
    text := l.input[l.start:l.pos]
    l.output <- token{typ: typ, val: text}
    l.start = l.pos
}


// accept consumes the next rune if it occurs in valid and reports whether it
// did so.
func (l *lexer) accept(valid string) bool {
    if !strings.ContainsRune(valid, l.peek()) {
        return false
    }
    l.next()
    return true
}


// acceptRun consumes runes for as long as the next one occurs in valid and
// returns the number of bytes consumed.
func (l *lexer) acceptRun(valid string) int {
    begin := l.pos
    for strings.ContainsRune(valid, l.peek()) {
        l.next()
    }
    return l.pos - begin
}


// acceptUntil consumes runes until the next one occurs in invalid or the input
// is exhausted. The stopping rune itself is not consumed.
func (l *lexer) acceptUntil(invalid string) {
    for l.pos < len(l.input) {
        if strings.ContainsRune(invalid, l.peek()) {
            return
        }
        l.next()
    }
}


// skipRun discards runes for as long as the next one occurs in valid and
// returns the number of bytes discarded.
func (l *lexer) skipRun(valid string) int {
    begin := l.pos
    for strings.ContainsRune(valid, l.peek()) {
        l.skip()
    }
    return l.pos - begin
}


// skipUntil discards runes until the next one occurs in invalid or the input
// is exhausted. The stopping rune itself is not discarded.
func (l *lexer) skipUntil(invalid string) {
    for l.pos < len(l.input) {
        if strings.ContainsRune(invalid, l.peek()) {
            return
        }
        l.skip()
    }
}


// lex creates a lexer for input and starts it running in its own goroutine.
// It returns the lexer together with the unbuffered channel on which the
// tokens are delivered; the channel is closed when lexing stops.
func lex(input string) (*lexer, chan token) {
    tokens := make(chan token)
    l := &lexer{input: input, output: tokens}
    go l.run()
    return l, tokens
}


// run drives the state machine: starting from lexTopLevel it repeatedly calls
// the current state function to obtain the next one, and closes the output
// channel once a state returns nil.
func (l *lexer) run() {
    var state lexerStateFun = lexTopLevel
    for state != nil {
        state = state(l)
    }
    close(l.output)
}


// What do we need?
// A function that consumes non-newline whitespace.
// A way of determining if the current line might be a recipe.


// lexTopLevel is the lexer's start state. It discards whitespace and line
// continuations, then picks the state that handles whatever comes next:
// a recipe if the position is inside an indented line, otherwise a state
// chosen by the next rune. It returns nil when the input is exhausted.
func lexTopLevel(l *lexer) lexerStateFun {
    for {
        l.skipRun(" \t\n\r")

        // A backslash immediately followed by a newline continues the
        // current line. (This used to test for a single quote, '\'', so
        // continuations were never recognised and a quoted string starting
        // with a newline was mangled.)
        if l.peek() != '\\' || l.peekN(1) != '\n' {
            break
        }

        // Discard both runes so they do not end up in the next token.
        l.skip() // '\\'
        l.skip() // '\n'

        // The newline just consumed set the flag; clear it so the continued
        // line is not mistaken for an indented recipe line.
        l.indented = false
    }

    // Check for end of input before the recipe test: trailing indentation at
    // the end of the input would otherwise bounce between this state and
    // lexRecipe forever without consuming anything.
    if l.pos >= len(l.input) {
        return nil
    }

    if l.indented && l.col > 0 {
        return lexRecipe
    }

    switch l.peek() {
    case eof:
        // A NUL rune embedded in the input also stops the lexer.
        return nil
    case '#':
        return lexComment
    case '<':
        return lexInclude
    case '"':
        return lexDoubleQuote
    case '\'':
        return lexSingleQuote
    case ':':
        return lexColon
    case '=':
        return lexAssign
    }

    return lexBareString
}


// lexColon consumes a single rune (the ':' that lexTopLevel dispatched on),
// emits it as a tokenColon and returns to the top level.
func lexColon(l *lexer) lexerStateFun {
    l.next() // ':'
    l.emit(tokenColon)
    return lexTopLevel
}


// lexAssign consumes a single rune (the '=' that lexTopLevel dispatched on),
// emits it as a tokenAssign and returns to the top level.
func lexAssign(l *lexer) lexerStateFun {
    l.next() // '='
    l.emit(tokenAssign)
    return lexTopLevel
}


// lexComment discards a comment: the leading '#' and everything after it up
// to, but not including, the next newline or the end of input. No token is
// emitted.
func lexComment(l *lexer) lexerStateFun {
    l.skip() // '#'
    l.skipUntil("\n")
    return lexTopLevel
}


// lexInclude handles an include directive. It discards the leading '<' and
// any whitespace (including newlines) that follows, then emits the text up to
// the next newline, carriage return or end of input as a tokenInclude.
func lexInclude(l *lexer) lexerStateFun {
    l.skip() // '<'
    l.skipRun(" \t\n\r")
    l.acceptUntil("\n\r")
    l.emit(tokenInclude)
    return lexTopLevel
}


// lexDoubleQuote lexes a double-quoted string. The surrounding quotes are
// discarded and the text between them is emitted as a tokenQuotedString.
// A backslash followed by a double quote keeps that quote inside the string;
// both runes remain in the token text.
//
// If the input ends before the closing quote, the lexer reports an error via
// lexerror and stops (returns nil). Previously the loop below never terminated
// in that case, because at end of input nothing can be consumed and the next
// rune is never '"'.
func lexDoubleQuote(l *lexer) lexerStateFun {
    l.skip() // opening '"'

    // Test the position rather than peek() == eof so that a NUL rune inside
    // the string is not mistaken for the end of the input.
    for l.pos < len(l.input) && l.peek() != '"' {
        l.acceptUntil("\\\"")
        if l.accept("\\") {
            l.accept("\"")
        }
    }

    if l.pos >= len(l.input) {
        l.lexerror("end of file encountered while parsing a quoted string")
        return nil
    }

    l.emit(tokenQuotedString)
    l.skip() // closing '"'
    return lexTopLevel
}


// lexSingleQuote lexes a single-quoted string. The opening quote is discarded,
// everything up to the next single quote (or the end of input) is emitted as
// a tokenQuotedString, and the closing quote is then discarded. No escape
// sequences are recognised.
func lexSingleQuote(l *lexer) lexerStateFun {
    l.skip() // opening '\''
    l.acceptUntil("'")
    l.emit(tokenQuotedString)
    l.skip() // closing '\''
    return lexTopLevel
}


// lexRecipe lexes a recipe: a run of consecutive indented lines. It consumes
// whole lines, plus the whitespace that follows each one, for as long as the
// position after that whitespace is still inside an indented line, and emits
// everything consumed as a single tokenRecipe.
//
// Reaching the end of the input ends the recipe and stops the lexer (returns
// nil). Previously, input ending in indentation (for example "\tcmd\n\t")
// left indented set with col > 0 and nothing more to consume, so the loop
// never exited.
func lexRecipe(l *lexer) lexerStateFun {
    for {
        l.acceptUntil("\n")
        l.acceptRun(" \t\n\r")

        if l.pos >= len(l.input) {
            // Only emit if something was actually consumed; entering this
            // state at end of input must not produce an empty recipe.
            if l.start < l.pos {
                l.emit(tokenRecipe)
            }
            return nil
        }

        if !l.indented || l.col == 0 {
            break
        }
    }

    // TODO: don't emit if there is only whitespace in the recipe
    l.emit(tokenRecipe)
    return lexTopLevel
}


// lexBareString lexes an unquoted word: everything up to the next whitespace,
// '=', ':', '#', quote character or the end of input, emitted as a
// tokenBareString.
//
// Backslashes get special treatment so that the state always makes progress.
// Previously a word starting with a backslash consumed nothing, and the lexer
// emitted empty tokens forever.
//   - A backslash followed by a newline is a line continuation: any text
//     gathered so far is emitted, both runes are discarded and the following
//     line is not treated as an indented recipe line.
//   - Any other backslash is kept as a literal part of the word. The rune
//     after it gets no special meaning.
func lexBareString(l *lexer) lexerStateFun {
    // TODO: allow escaping spaces and tabs?
    // TODO: allow adjacent quoted string, e.g.: foo"bar"baz?
    for {
        l.acceptUntil(" \t\n\r\\=:#'\"")
        if l.peek() != '\\' {
            break
        }

        if l.peekN(1) == '\n' {
            if l.start < l.pos {
                l.emit(tokenBareString)
            }
            l.skip() // '\\'
            l.skip() // '\n'
            // The newline just consumed set the flag; clear it so the
            // continued line is not lexed as a recipe.
            l.indented = false
            return lexTopLevel
        }

        l.next() // literal backslash
    }

    l.emit(tokenBareString)
    return lexTopLevel
}
First shot at a lexer. 2013-02-25 21:25:25 -08:00
			`package main`

			`import (`
			`"fmt"`
			`"strings"`
			`"unicode/utf8"`
			`)`

			`type tokenType int`

			`const eof rune = '\000'`

			`const (`
			`tokenError tokenType = iota`
			`tokenBareString`
			`tokenQuotedString`
			`tokenInclude`
			`tokenColon`
			`tokenAssign`
			`tokenRecipe`
			`)`


			`func (typ tokenType) String() string {`
			`switch typ {`
			`case tokenError: return "[Error]"`
			`case tokenBareString: return "[BareString]"`
			`case tokenQuotedString: return "[QuotedString]"`
			`case tokenInclude: return "[Include]"`
			`case tokenColon: return "[Colon]"`
			`case tokenAssign: return "[Assign]"`
			`case tokenRecipe: return "[Recipe]"`
			`}`
			`return "[MysteryToken]"`
			`}`


			`type token struct {`
			`typ tokenType // token type`
			`val string // token string`
			`}`


			`func (t *token) String() string {`
			`if t.typ == tokenError {`
			`return t.val`
			`}`

			`return fmt.Sprintf("%s %q", t.typ, t.val)`
			`}`


			`type lexer struct {`
			`input string // input string to be lexed`
			`output chan token // channel on which tokens are sent`
			`start int // token beginning`
			`pos int // position within input`
			`line int // line within input`
			`col int // column within input`
			`errmsg string // set to an appropriate error message when necessary`
			`indented bool // true if the only whitespace so far on this line`
			`}`


			`// A lexerStateFun is simultaneously the the state of the lexer and the next`
			`// action the lexer will perform.`
			`type lexerStateFun func (*lexer) lexerStateFun`


			`func (l *lexer) lexerror(what string) {`
			`l.errmsg = what`
			`l.emit(tokenError)`
			`}`


			`// Return the nth character without advancing.`
			`func (l *lexer) peekN(n int) (c rune) {`
			`pos := l.pos`
			`var width int`
			`i := 0`
			`for ; i <= n && pos < len(l.input); i++ {`
			`c, width = utf8.DecodeRuneInString(l.input[pos:])`
			`pos += width`
			`}`

			`if i <= n {`
			`return eof`
			`}`

			`return`
			`}`


			`// Return the next character without advancing.`
			`func (l *lexer) peek() rune {`
			`return l.peekN(0)`
			`}`


			`// Consume and return the next character in the lexer input.`
			`func (l *lexer) next() rune {`
			`if l.pos >= len(l.input) {`
			`return eof`
			`}`
			`c, width := utf8.DecodeRuneInString(l.input[l.pos:])`
			`l.pos += width`

			`if c == '\n' {`
			`l.col = 0`
			`l.line += 1`
			`l.indented = true`
			`} else {`
			`l.col += 1`
			`if strings.IndexRune(" \t", c) < 0 {`
			`l.indented = false`
			`}`
			`}`

			`return c`
			`}`


			`// Skip and return the next character in the lexer input.`
			`func (l *lexer) skip() {`
			`l.next()`
			`l.start = l.pos`
			`}`


			`func (l *lexer) emit(typ tokenType) {`
			`l.output <- token{typ, l.input[l.start:l.pos]}`
			`l.start = l.pos`
			`}`


			`// Consume the next run if it is in the given string.`
			`func (l *lexer) accept(valid string) bool {`
			`if strings.IndexRune(valid, l.peek()) >= 0 {`
			`l.next()`
			`return true`
			`}`
			`return false`
			`}`


			`// Consume characters from the valid string until the next is not.`
			`func (l *lexer) acceptRun(valid string) int {`
			`prevpos := l.pos`
			`for strings.IndexRune(valid, l.peek()) >= 0 {`
			`l.next()`
			`}`
			`return l.pos - prevpos`
			`}`


			`// Accept until something from the given string is encountered.`
			`func (l *lexer) acceptUntil(invalid string) {`
			`for l.pos < len(l.input) && strings.IndexRune(invalid, l.peek()) < 0 {`
			`l.next()`
			`}`
			`}`


			`// Skip characters from the valid string until the next is not.`
			`func (l* lexer) skipRun(valid string) int {`
			`prevpos := l.pos`
			`for strings.IndexRune(valid, l.peek()) >= 0 {`
			`l.skip()`
			`}`
			`return l.pos - prevpos`
			`}`


			`// Skip until something from the given string is encountered.`
			`func (l *lexer) skipUntil(invalid string) {`
			`for l.pos < len(l.input) && strings.IndexRune(invalid, l.peek()) < 0 {`
			`l.skip()`
			`}`
			`}`


			`// Start a new lexer to lex the given input.`
			`func lex(input string) (*lexer, chan token) {`
			`l := &lexer{input: input, output: make(chan token)}`
			`go l.run()`
			`return l, l.output`
			`}`


			`func (l *lexer) run() {`
			`for state := lexTopLevel; state != nil; {`
			`state = state(l)`
			`}`
			`close(l.output)`
			`}`


			`// What do we need?`
			`// A function that consumes non-newline whitespace.`
			`// A way of determining if the current line might be a recipe.`


			`func lexTopLevel (l *lexer) lexerStateFun {`

			`for {`
			`l.skipRun(" \t\n\r")`
			`if l.peek() == '\'' && l.peekN(1) == '\n' {`
			`l.next()`
			`l.next()`
			`l.indented = false`
			`} else {`
			`break`
			`}`
			`}`

			`if l.indented && l.col > 0 {`
			`return lexRecipe`
			`}`

			`c := l.peek()`
			`switch c {`
			`case eof: return nil`
			`case '#': return lexComment`
			`case '<': return lexInclude`
			`case '"': return lexDoubleQuote`
			`case '\'': return lexSingleQuote`
			`case ':': return lexColon`
			`case '=': return lexAssign`
			`}`

			`return lexBareString`
			`}`


			`func lexColon (l* lexer) lexerStateFun {`
			`l.next()`
			`l.emit(tokenColon)`
			`return lexTopLevel`
			`}`


			`func lexAssign (l* lexer) lexerStateFun {`
			`l.next()`
			`l.emit(tokenAssign)`
			`return lexTopLevel`
			`}`


			`func lexComment (l* lexer) lexerStateFun {`
			`l.skip() // '#'`
			`l.skipUntil("\n")`
			`return lexTopLevel`
			`}`


			`func lexInclude (l* lexer) lexerStateFun {`
			`l.skip() // '<'`
			`l.skipRun(" \t\n\r")`
			`l.acceptUntil("\n\r")`
			`l.emit(tokenInclude)`
			`return lexTopLevel`
			`}`


			`func lexDoubleQuote (l *lexer) lexerStateFun {`
			`l.skip() // '"'`
			`for l.peek() != '"' {`
			`l.acceptUntil("\\\"")`
			`if l.accept("\\") {`
			`l.accept("\"")`
			`}`
			`}`
			`l.emit(tokenQuotedString)`
			`l.skip() // skip '"'`
			`return lexTopLevel`
			`}`


			`func lexSingleQuote (l *lexer) lexerStateFun {`
			`l.skip() // '\''`
			`l.acceptUntil("'")`
			`l.emit(tokenQuotedString)`
			`l.skip() // '\''`
			`return lexTopLevel`
			`}`


			`func lexRecipe (l *lexer) lexerStateFun {`

			`for {`
			`l.acceptUntil("\n")`
			`l.acceptRun(" \t\n\r")`
			`if !l.indented \|\| l.col == 0 {`
			`break`
			`}`
			`}`

			`// TODO: don't emit if there is only whitespace in the recipe`
			`l.emit(tokenRecipe)`
			`return lexTopLevel`
			`}`


			`func lexBareString (l *lexer) lexerStateFun {`
			`// TODO: allow escaping spaces and tabs?`
			`// TODO: allow adjacent quoted string, e.g.: foo"bar"baz?`
			`l.acceptUntil(" \t\n\r\\=:#'\"")`
			`l.emit(tokenBareString)`
			`return lexTopLevel`
			`}`