First shot at a lexer.

Daniel Jones 2013-02-25 21:25:25 -08:00
commit 9ba796161d
2 changed files with 337 additions and 0 deletions

lex.go (new file, 313 lines)

@@ -0,0 +1,313 @@
package main
import (
"fmt"
"strings"
"unicode/utf8"
)
type tokenType int
const eof rune = '\000' // NUL doubles as an in-band end-of-input sentinel for peek and next
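// The kinds of tokens the lexer produces.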
const (
tokenError tokenType = iota
tokenBareString
tokenQuotedString
tokenInclude
tokenColon
tokenAssign
tokenRecipe
)
func (typ tokenType) String() string {
switch typ {
case tokenError: return "[Error]"
case tokenBareString: return "[BareString]"
case tokenQuotedString: return "[QuotedString]"
case tokenInclude: return "[Include]"
case tokenColon: return "[Colon]"
case tokenAssign: return "[Assign]"
case tokenRecipe: return "[Recipe]"
}
return "[MysteryToken]"
}
type token struct {
typ tokenType // token type
val string // token string
}
func (t *token) String() string {
if t.typ == tokenError {
return t.val
}
return fmt.Sprintf("%s %q", t.typ, t.val)
}
type lexer struct {
input string // input string to be lexed
output chan token // channel on which tokens are sent
start int // token beginning
pos int // position within input
line int // line within input
col int // column within input
errmsg string // set to an appropriate error message when necessary
indented bool // true if we have seen only whitespace so far on this line
}
// A lexerStateFun is simultaneously the state of the lexer and the next
// action the lexer will perform.
type lexerStateFun func(*lexer) lexerStateFun
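// For example, lexTopLevel returns lexComment upon seeing '#'; lexComment
// consumes the rest of the line and returns lexTopLevel. Lexing halts when a
// state function returns nil.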
func (l *lexer) lexerror(what string) {
l.errmsg = what
l.emit(tokenError)
}
// Return the rune n characters ahead of the current position without
// advancing; returns eof if the input ends first. peekN(0) is the next rune.
func (l *lexer) peekN(n int) (c rune) {
pos := l.pos
var width int
i := 0
for ; i <= n && pos < len(l.input); i++ {
c, width = utf8.DecodeRuneInString(l.input[pos:])
pos += width
}
if i <= n {
return eof
}
return
}
// Return the next character without advancing.
func (l *lexer) peek() rune {
return l.peekN(0)
}
// Consume and return the next character in the lexer input.
func (l *lexer) next() rune {
if l.pos >= len(l.input) {
return eof
}
c, width := utf8.DecodeRuneInString(l.input[l.pos:])
l.pos += width
if c == '\n' {
l.col = 0
l.line += 1
l.indented = true
} else {
l.col += 1
if strings.IndexRune(" \t", c) < 0 {
l.indented = false
}
}
return c
}
// Skip over the next character in the lexer input without emitting it.
func (l *lexer) skip() {
l.next()
l.start = l.pos
}
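// Send the token accumulated so far (input[start:pos]) on the output channel
// and begin a new token at the current position.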
func (l *lexer) emit(typ tokenType) {
l.output <- token{typ, l.input[l.start:l.pos]}
l.start = l.pos
}
// Consume the next rune if it is in the given string.
func (l *lexer) accept(valid string) bool {
if strings.IndexRune(valid, l.peek()) >= 0 {
l.next()
return true
}
return false
}
// Consume characters from the valid string until the next is not; return the
// number of bytes consumed.
func (l *lexer) acceptRun(valid string) int {
prevpos := l.pos
for strings.IndexRune(valid, l.peek()) >= 0 {
l.next()
}
return l.pos - prevpos
}
// Accept until something from the given string is encountered.
func (l *lexer) acceptUntil(invalid string) {
for l.pos < len(l.input) && strings.IndexRune(invalid, l.peek()) < 0 {
l.next()
}
}
// Skip characters from the valid string until the next is not; return the
// number of bytes skipped.
func (l *lexer) skipRun(valid string) int {
prevpos := l.pos
for strings.IndexRune(valid, l.peek()) >= 0 {
l.skip()
}
return l.pos - prevpos
}
// Skip until something from the given string is encountered.
func (l *lexer) skipUntil(invalid string) {
for l.pos < len(l.input) && strings.IndexRune(invalid, l.peek()) < 0 {
l.skip()
}
}
// Start a new lexer to lex the given input.
func lex(input string) (*lexer, chan token) {
// A fresh lexer is at the start of a line, so it begins indented.
l := &lexer{input: input, output: make(chan token), line: 1, indented: true}
go l.run()
return l, l.output
}
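// Drive the state machine: each state function returns the next state, and
// the output channel is closed when a state returns nil.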
func (l *lexer) run() {
for state := lexTopLevel; state != nil; {
state = state(l)
}
close(l.output)
}
// lexTopLevel is the initial state: skip whitespace and escaped newlines,
// hand indented lines off to lexRecipe, and otherwise dispatch on the next
// significant character.
func lexTopLevel(l *lexer) lexerStateFun {
for {
l.skipRun(" \t\n\r")
// A backslash at the end of a line continues it: consume the pair
// without marking the next line as indented.
if l.peek() == '\\' && l.peekN(1) == '\n' {
l.next()
l.next()
l.indented = false
} else {
break
}
}
if l.indented && l.col > 0 {
return lexRecipe
}
c := l.peek()
switch c {
case eof: return nil
case '#': return lexComment
case '<': return lexInclude
case '"': return lexDoubleQuote
case '\'': return lexSingleQuote
case ':': return lexColon
case '=': return lexAssign
}
return lexBareString
}
func lexColon(l *lexer) lexerStateFun {
l.next()
l.emit(tokenColon)
return lexTopLevel
}
func lexAssign(l *lexer) lexerStateFun {
l.next()
l.emit(tokenAssign)
return lexTopLevel
}
func lexComment(l *lexer) lexerStateFun {
l.skip() // '#'
l.skipUntil("\n")
return lexTopLevel
}
func lexInclude(l *lexer) lexerStateFun {
l.skip() // '<'
l.skipRun(" \t\n\r")
l.acceptUntil("\n\r")
l.emit(tokenInclude)
return lexTopLevel
}
func lexDoubleQuote(l *lexer) lexerStateFun {
l.skip() // '"'
for l.peek() != '"' {
l.acceptUntil("\\\"")
if l.accept("\\") {
l.accept("\"")
}
// Guard against an unterminated string, which would otherwise loop forever.
if l.peek() == eof {
l.lexerror("end of input encountered within a quoted string")
return nil
}
}
l.emit(tokenQuotedString)
l.skip() // skip '"'
return lexTopLevel
}
func lexSingleQuote(l *lexer) lexerStateFun {
l.skip() // '\''
l.acceptUntil("'")
// An unterminated string is an error rather than silently accepted.
if l.peek() == eof {
l.lexerror("end of input encountered within a quoted string")
return nil
}
l.emit(tokenQuotedString)
l.skip() // '\''
return lexTopLevel
}
func lexRecipe(l *lexer) lexerStateFun {
for {
l.acceptUntil("\n")
l.acceptRun(" \t\n\r")
// Stop at end of input, or once a line starts with non-whitespace.
if l.peek() == eof || !l.indented || l.col == 0 {
break
}
}
// TODO: don't emit if there is only whitespace in the recipe
l.emit(tokenRecipe)
return lexTopLevel
}
func lexBareString(l *lexer) lexerStateFun {
// TODO: allow escaping spaces and tabs?
// TODO: allow adjacent quoted string, e.g.: foo"bar"baz?
l.acceptUntil(" \t\n\r\\=:#'\"")
l.emit(tokenBareString)
return lexTopLevel
}

mk.go (new file, 24 lines)

@@ -0,0 +1,24 @@
package main
import (
"fmt"
"os"
"io/ioutil"
)
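// Read a mkfile from standard input and print its token stream, as a quick
// way to exercise the lexer.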
func main() {
input, err := ioutil.ReadAll(os.Stdin)
if err != nil {
fmt.Fprintln(os.Stderr, "error reading stdin:", err)
os.Exit(1)
}
l, tokens := lex(string(input))
for t := range tokens {
if t.typ == tokenError {
fmt.Printf("Error: %s\n", l.errmsg)
break
}
fmt.Println(t.String())
}
}