package main

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

type tokenType int

const eof rune = '\000'

// Runes that cannot be part of a bare (unquoted) string.
const nonBareRunes = " \t\n\r\\=:#'\"$"

// Return true if the string contains whitespace only.
func onlyWhitespace(s string) bool {
	return strings.TrimSpace(s) == ""
}

const (
	tokenError tokenType = iota
	tokenNewline
	tokenWord
	tokenPipeInclude
	tokenRedirInclude
	tokenColon
	tokenAssign
	tokenRecipe
)

func (typ tokenType) String() string {
	switch typ {
	case tokenError:
		return "[Error]"
	case tokenNewline:
		return "[Newline]"
	case tokenWord:
		return "[Word]"
	case tokenPipeInclude:
		return "[PipeInclude]"
	case tokenRedirInclude:
		return "[RedirInclude]"
	case tokenColon:
		return "[Colon]"
	case tokenAssign:
		return "[Assign]"
	case tokenRecipe:
		return "[Recipe]"
	}
	return "[MysteryToken]"
}

type token struct {
	typ  tokenType // token type
	val  string    // token string
	line int       // line where it was found
	col  int       // column on which the token began
}

func (t *token) String() string {
	if t.typ == tokenError {
		return t.val
	} else if t.typ == tokenNewline {
		return "\\n"
	}
	return t.val
}

type lexer struct {
	input     string     // input string to be lexed
	output    chan token // channel on which tokens are sent
	start     int        // beginning of the token currently being read
	startcol  int        // column on which the current token begins
	pos       int        // position within input
	line      int        // line within input
	col       int        // column within input
	errmsg    string     // set to an appropriate error message when necessary
	indented  bool       // true if the current line has so far contained only whitespace
	barewords bool       // lex only a sequence of words
}

// A lexerStateFun is simultaneously the state of the lexer and the next
// action the lexer will perform.
type lexerStateFun func(*lexer) lexerStateFun

func (l *lexer) lexerror(what string) {
	if l.errmsg == "" {
		l.errmsg = what
	}
	l.emit(tokenError)
}

// Return the nth character without advancing.
func (l *lexer) peekN(n int) (c rune) {
	pos := l.pos
	var width int
	i := 0
	for ; i <= n && pos < len(l.input); i++ {
		c, width = utf8.DecodeRuneInString(l.input[pos:])
		pos += width
	}
	if i <= n {
		return eof
	}
	return
}

// Return the next character without advancing.
func (l *lexer) peek() rune {
	return l.peekN(0)
}

// Consume and return the next character in the lexer input.
func (l *lexer) next() rune {
	if l.pos >= len(l.input) {
		return eof
	}
	c, width := utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += width
	if c == '\n' {
		l.col = 0
		l.line += 1
		l.indented = true
	} else {
		l.col += 1
		if strings.IndexRune(" \t", c) < 0 {
			l.indented = false
		}
	}
	return c
}

// Consume and discard the next character in the lexer input.
func (l *lexer) skip() {
	l.next()
	l.start = l.pos
	l.startcol = l.col
}

func (l *lexer) emit(typ tokenType) {
	l.output <- token{typ, l.input[l.start:l.pos], l.line, l.startcol}
	l.start = l.pos
	l.startcol = 0
}

// Consume the next rune if it is in the given string.
func (l *lexer) accept(valid string) bool {
	if strings.IndexRune(valid, l.peek()) >= 0 {
		l.next()
		return true
	}
	return false
}

// Skip the next rune if it is in the valid string. Return true if it was
// skipped.
func (l *lexer) ignore(valid string) bool {
	if strings.IndexRune(valid, l.peek()) >= 0 {
		l.skip()
		return true
	}
	return false
}

// Consume characters from the valid string until the next is not.
func (l *lexer) acceptRun(valid string) int {
	prevpos := l.pos
	for strings.IndexRune(valid, l.peek()) >= 0 {
		l.next()
	}
	return l.pos - prevpos
}
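// Illustrative sketch (hypothetical, not called by the lexer): the accept*
// helpers accumulate text for the pending token, while the skip* helpers
// consume input and discard it by resetting l.start. For the input "  foo",
// a state function could lex one word as:
//
//	l.skipRun(" \t")            // discard the leading blanks
//	l.acceptUntilOrEof(" \t\n") // accumulate "foo" as pending token text
//	l.emit(tokenWord)           // send {tokenWord, "foo", ...} on l.output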
// Accept until something from the given string is encountered.
func (l *lexer) acceptUntil(invalid string) {
	for l.pos < len(l.input) && strings.IndexRune(invalid, l.peek()) < 0 {
		l.next()
	}

	if l.peek() == eof {
		l.lexerror(fmt.Sprintf("end of file encountered while looking for one of: %s", invalid))
	}
}

// Accept until something from the given string is encountered, or the end
// of the file.
func (l *lexer) acceptUntilOrEof(invalid string) {
	for l.pos < len(l.input) && strings.IndexRune(invalid, l.peek()) < 0 {
		l.next()
	}
}

// Skip characters from the valid string until the next is not.
func (l *lexer) skipRun(valid string) int {
	prevpos := l.pos
	for strings.IndexRune(valid, l.peek()) >= 0 {
		l.skip()
	}
	return l.pos - prevpos
}

// Skip until something from the given string is encountered.
func (l *lexer) skipUntil(invalid string) {
	for l.pos < len(l.input) && strings.IndexRune(invalid, l.peek()) < 0 {
		l.skip()
	}

	if l.peek() == eof {
		l.lexerror(fmt.Sprintf("end of file encountered while looking for one of: %s", invalid))
	}
}

// Start a new lexer to lex the given input.
func lex(input string) (*lexer, chan token) {
	l := &lexer{input: input, output: make(chan token), line: 1, col: 0, indented: true}
	go l.run()
	return l, l.output
}

func lexWords(input string) (*lexer, chan token) {
	l := &lexer{input: input, output: make(chan token), line: 1, col: 0, indented: true, barewords: true}
	go l.run()
	return l, l.output
}

func (l *lexer) run() {
	for state := lexTopLevel; state != nil; {
		state = state(l)
	}
	close(l.output)
}

func lexTopLevel(l *lexer) lexerStateFun {
	for {
		l.skipRun(" \t\r")
		// Emit a newline token if we are ending a non-empty line.
		if l.peek() == '\n' && !l.indented {
			l.next()
			if l.barewords {
				return nil
			} else {
				l.emit(tokenNewline)
			}
		}
		l.skipRun(" \t\r\n")

		if l.peek() == '\\' && l.peekN(1) == '\n' {
			l.next()
			l.next()
			l.indented = false
		} else {
			break
		}
	}

	if l.indented && l.col > 0 {
		return lexRecipe
	}

	c := l.peek()
	switch c {
	case eof:
		return nil
	case '#':
		return lexComment
	case '<':
		return lexInclude
	case ':':
		return lexColon
	case '=':
		return lexAssign
	case '"':
		return lexDoubleQuotedWord
	case '\'':
		return lexSingleQuotedWord
	case '`':
		return lexBackQuotedWord
	}

	return lexBareWord
}

func lexColon(l *lexer) lexerStateFun {
	l.next()
	l.emit(tokenColon)
	return lexTopLevel
}

func lexAssign(l *lexer) lexerStateFun {
	l.next()
	l.emit(tokenAssign)
	return lexTopLevel
}

func lexComment(l *lexer) lexerStateFun {
	l.skip() // '#'
	l.skipUntil("\n")
	return lexTopLevel
}

func lexInclude(l *lexer) lexerStateFun {
	l.next() // '<'
	if l.accept("|") {
		l.emit(tokenPipeInclude)
	} else {
		l.emit(tokenRedirInclude)
	}
	return lexTopLevel
}

func lexDoubleQuotedWord(l *lexer) lexerStateFun {
	l.next() // '"'
	for l.peek() != '"' && l.peek() != eof {
		l.acceptUntil("\\\"")
		if l.accept("\\") {
			l.accept("\"")
		}
	}

	if l.peek() == eof {
		l.lexerror("end of file encountered while parsing a quoted string.")
	}

	l.next() // '"'
	return lexBareWord
}

func lexBackQuotedWord(l *lexer) lexerStateFun {
	l.next() // '`'
	l.acceptUntil("`")
	l.next() // '`'
	return lexBareWord
}

func lexSingleQuotedWord(l *lexer) lexerStateFun {
	l.next() // '\''
	l.acceptUntil("'")
	l.next() // '\''
	return lexBareWord
}

func lexRecipe(l *lexer) lexerStateFun {
	for {
		l.acceptUntilOrEof("\n")
		l.acceptRun(" \t\n\r")
		// Guard against input that ends in indentation: at eof no further
		// progress is possible and the loop would otherwise never terminate.
		if l.peek() == eof {
			break
		}
		if !l.indented || l.col == 0 {
			break
		}
	}

	if !onlyWhitespace(l.input[l.start:l.pos]) {
		l.emit(tokenRecipe)
	}
	return lexTopLevel
}
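// As a rough illustration of the states above (hypothetical input, token
// rendering abbreviated), a fragment such as:
//
//	target: prereq
//		cc -o target prereq
//
// is emitted approximately as:
//
//	[Word "target"] [Colon ":"] [Word "prereq"] [Newline]
//	[Recipe "cc -o target prereq\n"]
//
// The recipe's leading tab is discarded by the skipRun calls in lexTopLevel
// before lexRecipe begins accumulating text.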
// Lex a bare (unquoted) word, dispatching to the quoted-word states when a
// quote character follows without intervening whitespace.
func lexBareWord(l *lexer) lexerStateFun {
	l.acceptUntil(nonBareRunes)
	c := l.peek()
	if c == '"' {
		return lexDoubleQuotedWord
	} else if c == '\'' {
		return lexSingleQuotedWord
	} else if c == '`' {
		return lexBackQuotedWord
	} else if c == '\\' {
		c1 := l.peekN(1)
		if c1 == '\n' || c1 == '\r' {
			// A backslash-newline ends the word: emit what we have and
			// discard the escaped line break.
			if l.start < l.pos {
				l.emit(tokenWord)
			}
			l.skip()
			l.skip()
			return lexTopLevel
		} else {
			// Any other escape sequence is kept in the word verbatim.
			l.next()
			l.next()
			return lexBareWord
		}
	} else if c == '$' {
		c1 := l.peekN(1)
		if c1 == '{' {
			return lexBracketExpansion
		} else {
			// A '$' not followed by '{' is an ordinary word character.
			l.next()
			return lexBareWord
		}
	}

	if l.start < l.pos {
		l.emit(tokenWord)
	}

	return lexTopLevel
}

func lexBracketExpansion(l *lexer) lexerStateFun {
	l.next() // '$'
	l.next() // '{'
	l.acceptUntil("}")
	l.next() // '}'
	return lexBareWord
}
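// exampleDrive is a minimal sketch (a hypothetical helper, not part of the
// lexer proper) showing how the token channel returned by lex is consumed.
// run closes l.output once the final state function returns nil, which ends
// the range loop below.
func exampleDrive() {
	_, tokens := lex("target: prereq\n\tcc -o target prereq\n")
	for t := range tokens {
		fmt.Printf("%d:%d %s %q\n", t.line, t.col, t.typ, t.val)
	}
}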