mk/lex.go

410 lines
8 KiB
Go
Raw Normal View History

2013-02-25 21:25:25 -08:00
package main
import (
2013-03-10 00:34:42 -08:00
"fmt"
2013-02-26 11:33:07 -08:00
"strings"
"unicode/utf8"
2013-02-25 21:25:25 -08:00
)
// tokenType identifies the lexical category of a token.
type tokenType int

// eof is the sentinel rune returned by peek/next when input is exhausted.
const eof rune = '\000'

// nonBareRunes holds every rune that cannot appear in a bare (unquoted)
// string: whitespace, the escape character, and mkfile punctuation.
const nonBareRunes = " \t\n\r\\=:#'\"$"
// onlyWhitespace reports whether s consists entirely of whitespace
// (spaces, tabs, carriage returns, newlines). The empty string counts
// as whitespace-only.
//
// NOTE(review): the previous body, strings.IndexAny(s, " \t\r\n") < 0,
// returned true when s contained NO whitespace at all — the inverse of
// the documented intent — which let whitespace-only recipe bodies be
// emitted by lexRecipe.
func onlyWhitespace(s string) bool {
	return strings.Trim(s, " \t\r\n") == ""
}
const (
2013-02-26 11:33:07 -08:00
tokenError tokenType = iota
tokenNewline
2013-02-26 22:41:25 -08:00
tokenWord
2013-02-26 11:33:07 -08:00
tokenPipeInclude
tokenRedirInclude
tokenColon
tokenAssign
tokenRecipe
2013-02-25 21:25:25 -08:00
)
func (typ tokenType) String() string {
2013-02-26 11:33:07 -08:00
switch typ {
case tokenError:
return "[Error]"
case tokenNewline:
return "[Newline]"
2013-02-26 22:41:25 -08:00
case tokenWord:
return "[Word]"
2013-02-26 11:33:07 -08:00
case tokenPipeInclude:
return "[PipeInclude]"
case tokenRedirInclude:
return "[RedirInclude]"
case tokenColon:
return "[Colon]"
case tokenAssign:
return "[Assign]"
case tokenRecipe:
return "[Recipe]"
}
return "[MysteryToken]"
2013-02-25 21:25:25 -08:00
}
type token struct {
2013-02-26 11:33:07 -08:00
typ tokenType // token type
val string // token string
line int // line where it was found
2013-03-03 18:57:14 -08:00
col int // column on which the token began
2013-02-25 21:25:25 -08:00
}
func (t *token) String() string {
2013-02-26 11:33:07 -08:00
if t.typ == tokenError {
return t.val
} else if t.typ == tokenNewline {
return "\\n"
}
2013-02-25 21:25:25 -08:00
2013-02-26 11:33:07 -08:00
return t.val
2013-02-25 21:25:25 -08:00
}
type lexer struct {
input string // input string to be lexed
output chan token // channel on which tokens are sent
start int // token beginning
startcol int // column on which the token begins
pos int // position within input
line int // line within input
col int // column within input
errmsg string // set to an appropriate error message when necessary
indented bool // true if the only whitespace so far on this line
barewords bool // lex only a sequence of words
2013-02-25 21:25:25 -08:00
}
// A lexerStateFun is simultaneously the the state of the lexer and the next
// action the lexer will perform.
2013-02-26 11:33:07 -08:00
type lexerStateFun func(*lexer) lexerStateFun
2013-02-25 21:25:25 -08:00
func (l *lexer) lexerror(what string) {
2013-03-10 00:34:42 -08:00
if l.errmsg == "" {
l.errmsg = what
}
2013-02-26 11:33:07 -08:00
l.emit(tokenError)
2013-02-25 21:25:25 -08:00
}
// Return the nth character without advancing.
func (l *lexer) peekN(n int) (c rune) {
2013-02-26 11:33:07 -08:00
pos := l.pos
var width int
i := 0
for ; i <= n && pos < len(l.input); i++ {
c, width = utf8.DecodeRuneInString(l.input[pos:])
pos += width
}
if i <= n {
return eof
}
return
2013-02-25 21:25:25 -08:00
}
// Return the next character without advancing.
func (l *lexer) peek() rune {
2013-02-26 11:33:07 -08:00
return l.peekN(0)
2013-02-25 21:25:25 -08:00
}
// Consume and return the next character in the lexer input.
func (l *lexer) next() rune {
2013-02-26 11:33:07 -08:00
if l.pos >= len(l.input) {
return eof
}
c, width := utf8.DecodeRuneInString(l.input[l.pos:])
l.pos += width
if c == '\n' {
l.col = 0
l.line += 1
l.indented = true
} else {
l.col += 1
if strings.IndexRune(" \t", c) < 0 {
l.indented = false
}
}
return c
2013-02-25 21:25:25 -08:00
}
// Skip and return the next character in the lexer input.
func (l *lexer) skip() {
2013-02-26 11:33:07 -08:00
l.next()
l.start = l.pos
2013-03-03 18:57:14 -08:00
l.startcol = l.col
2013-02-25 21:25:25 -08:00
}
func (l *lexer) emit(typ tokenType) {
2013-03-03 18:57:14 -08:00
l.output <- token{typ, l.input[l.start:l.pos], l.line, l.startcol}
2013-02-26 11:33:07 -08:00
l.start = l.pos
2013-03-03 18:57:14 -08:00
l.startcol = 0
2013-02-25 21:25:25 -08:00
}
// Consume the next run if it is in the given string.
func (l *lexer) accept(valid string) bool {
2013-02-26 11:33:07 -08:00
if strings.IndexRune(valid, l.peek()) >= 0 {
l.next()
return true
}
return false
2013-02-25 21:25:25 -08:00
}
2013-02-25 23:52:08 -08:00
// Skip the next rune if it is in the valid string. Return true if it was
// skipped.
func (l *lexer) ignore(valid string) bool {
2013-02-26 11:33:07 -08:00
if strings.IndexRune(valid, l.peek()) >= 0 {
l.skip()
return true
}
return false
2013-02-25 23:52:08 -08:00
}
2013-02-25 21:25:25 -08:00
// Consume characters from the valid string until the next is not.
func (l *lexer) acceptRun(valid string) int {
2013-02-26 11:33:07 -08:00
prevpos := l.pos
for strings.IndexRune(valid, l.peek()) >= 0 {
l.next()
}
return l.pos - prevpos
2013-02-25 21:25:25 -08:00
}
// Accept until something from the given string is encountered.
func (l *lexer) acceptUntil(invalid string) {
2013-02-26 11:33:07 -08:00
for l.pos < len(l.input) && strings.IndexRune(invalid, l.peek()) < 0 {
l.next()
}
2013-03-09 20:54:13 -08:00
2013-03-10 00:34:42 -08:00
if l.peek() == eof {
l.lexerror(fmt.Sprintf("end of file encountered while looking for one of: %s", invalid))
}
2013-02-25 21:25:25 -08:00
}
2013-08-19 00:05:17 -07:00
// Accept until something from the given string is encountered, or the end of th
// file
func (l *lexer) acceptUntilOrEof(invalid string) {
for l.pos < len(l.input) && strings.IndexRune(invalid, l.peek()) < 0 {
l.next()
}
}
2013-02-25 21:25:25 -08:00
// Skip characters from the valid string until the next is not.
2013-02-26 11:33:07 -08:00
func (l *lexer) skipRun(valid string) int {
prevpos := l.pos
for strings.IndexRune(valid, l.peek()) >= 0 {
l.skip()
}
return l.pos - prevpos
2013-02-25 21:25:25 -08:00
}
// Skip until something from the given string is encountered.
func (l *lexer) skipUntil(invalid string) {
2013-02-26 11:33:07 -08:00
for l.pos < len(l.input) && strings.IndexRune(invalid, l.peek()) < 0 {
l.skip()
}
2013-03-09 20:54:13 -08:00
2013-03-10 00:34:42 -08:00
if l.peek() == eof {
l.lexerror(fmt.Sprintf("end of file encountered while looking for one of: %s", invalid))
}
2013-02-25 21:25:25 -08:00
}
// Start a new lexer to lex the given input.
func lex(input string) (*lexer, chan token) {
2013-03-03 18:57:14 -08:00
l := &lexer{input: input, output: make(chan token), line: 1, col: 0, indented: true}
2013-02-26 11:33:07 -08:00
go l.run()
return l, l.output
2013-02-25 21:25:25 -08:00
}
func lexWords(input string) (*lexer, chan token) {
l := &lexer{input: input, output: make(chan token), line: 1, col: 0, indented: true, barewords: true}
go l.run()
return l, l.output
}
2013-02-25 21:25:25 -08:00
func (l *lexer) run() {
2013-02-26 11:33:07 -08:00
for state := lexTopLevel; state != nil; {
state = state(l)
}
close(l.output)
2013-02-25 21:25:25 -08:00
}
2013-02-26 11:33:07 -08:00
func lexTopLevel(l *lexer) lexerStateFun {
for {
l.skipRun(" \t\r")
// emit a newline token if we are ending a non-empty line.
if l.peek() == '\n' && !l.indented {
l.next()
if l.barewords {
return nil
} else {
l.emit(tokenNewline)
}
2013-02-26 11:33:07 -08:00
}
l.skipRun(" \t\r\n")
2013-02-26 22:41:25 -08:00
if l.peek() == '\\' && l.peekN(1) == '\n' {
2013-02-26 11:33:07 -08:00
l.next()
l.next()
l.indented = false
} else {
break
}
}
if l.indented && l.col > 0 {
return lexRecipe
}
c := l.peek()
switch c {
case eof:
return nil
case '#':
return lexComment
case '<':
return lexInclude
case ':':
return lexColon
case '=':
return lexAssign
2013-02-26 22:41:25 -08:00
case '"':
return lexDoubleQuotedWord
case '\'':
return lexSingleQuotedWord
case '`':
return lexBackQuotedWord
2013-02-26 11:33:07 -08:00
}
2013-02-26 22:41:25 -08:00
return lexBareWord
2013-02-25 21:25:25 -08:00
}
2013-02-26 11:33:07 -08:00
func lexColon(l *lexer) lexerStateFun {
l.next()
l.emit(tokenColon)
return lexTopLevel
2013-02-25 21:25:25 -08:00
}
2013-02-26 11:33:07 -08:00
func lexAssign(l *lexer) lexerStateFun {
l.next()
l.emit(tokenAssign)
return lexTopLevel
2013-02-25 21:25:25 -08:00
}
2013-02-26 11:33:07 -08:00
func lexComment(l *lexer) lexerStateFun {
l.skip() // '#'
l.skipUntil("\n")
return lexTopLevel
2013-02-25 21:25:25 -08:00
}
2013-02-26 11:33:07 -08:00
func lexInclude(l *lexer) lexerStateFun {
l.next() // '<'
if l.accept("|") {
l.emit(tokenPipeInclude)
} else {
l.emit(tokenRedirInclude)
}
return lexTopLevel
2013-02-25 21:25:25 -08:00
}
2013-02-26 22:41:25 -08:00
func lexDoubleQuotedWord(l *lexer) lexerStateFun {
l.next() // '"'
2013-03-09 20:54:13 -08:00
for l.peek() != '"' && l.peek() != eof {
2013-02-26 11:33:07 -08:00
l.acceptUntil("\\\"")
if l.accept("\\") {
l.accept("\"")
}
}
2013-03-09 20:54:13 -08:00
2013-03-10 00:34:42 -08:00
if l.peek() == eof {
l.lexerror("end of file encountered while parsing a quoted string.")
}
2013-03-09 20:54:13 -08:00
2013-02-26 22:41:25 -08:00
l.next() // '"'
return lexBareWord
2013-02-25 21:25:25 -08:00
}
2013-02-26 22:41:25 -08:00
// lexBackQuotedWord scans a backquoted word; no escapes are honored.
func lexBackQuotedWord(l *lexer) lexerStateFun {
	l.next() // consume the opening '`'
	l.acceptUntil("`")
	l.next() // consume the closing '`'
	return lexBareWord
}
func lexSingleQuotedWord(l *lexer) lexerStateFun {
l.next() // '\''
2013-02-26 11:33:07 -08:00
l.acceptUntil("'")
2013-02-26 22:41:25 -08:00
l.next() // '\''
return lexBareWord
2013-02-25 21:25:25 -08:00
}
2013-02-26 11:33:07 -08:00
func lexRecipe(l *lexer) lexerStateFun {
for {
2013-08-19 00:05:17 -07:00
l.acceptUntilOrEof("\n")
2013-02-26 11:33:07 -08:00
l.acceptRun(" \t\n\r")
if !l.indented || l.col == 0 {
break
}
}
2013-03-10 00:34:42 -08:00
if !onlyWhitespace(l.input[l.start:l.pos]) {
l.emit(tokenRecipe)
}
2013-02-26 11:33:07 -08:00
return lexTopLevel
2013-02-25 21:25:25 -08:00
}
2013-02-26 22:41:25 -08:00
func lexBareWord(l *lexer) lexerStateFun {
2013-03-09 20:54:13 -08:00
l.acceptUntil(nonBareRunes)
2014-02-01 18:19:26 -08:00
c := l.peek()
if c == '"' {
2013-02-26 22:41:25 -08:00
return lexDoubleQuotedWord
2014-02-01 18:19:26 -08:00
} else if c == '\'' {
2013-02-26 22:41:25 -08:00
return lexSingleQuotedWord
2014-02-01 18:19:26 -08:00
} else if c == '`' {
2013-02-26 22:41:25 -08:00
return lexBackQuotedWord
2014-02-01 18:19:26 -08:00
} else if c == '\\' {
c1 := l.peekN(1)
if c1 == '\n' || c1 == '\r' {
if l.start < l.pos {
l.emit(tokenWord)
}
l.skip()
l.skip()
return lexTopLevel
} else {
l.next()
l.next()
return lexBareWord
}
2014-08-04 12:30:57 -07:00
} else if c == '$' {
c1 := l.peekN(1)
if c1 == '{' {
return lexBracketExpansion
} else {
l.next()
return lexBareWord
}
2013-02-26 22:41:25 -08:00
}
if l.start < l.pos {
l.emit(tokenWord)
}
2013-02-26 11:33:07 -08:00
return lexTopLevel
2013-02-25 21:25:25 -08:00
}
2014-08-04 12:30:57 -07:00
// lexBracketExpansion consumes a "${...}" variable expansion verbatim
// as part of the surrounding word.
func lexBracketExpansion(l *lexer) lexerStateFun {
	l.next() // consume '$'
	l.next() // consume '{'
	l.acceptUntil("}")
	l.next() // consume '}'
	return lexBareWord
}