package pongo2 import ( "fmt" "strings" "unicode/utf8" "errors" ) const ( TokenError = iota EOF TokenHTML TokenKeyword TokenIdentifier TokenString TokenNumber TokenSymbol ) var ( tokenSpaceChars = " \n\r\t" tokenIdentifierChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_" tokenIdentifierCharsWithDigits = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789" tokenDigits = "0123456789" // Available symbols in pongo2 (within filters/tag) TokenSymbols = []string{ // 3-Char symbols "{{-", "-}}", "{%-", "-%}", // 2-Char symbols "==", ">=", "<=", "&&", "||", "{{", "}}", "{%", "%}", "!=", "<>", // 1-Char symbol "(", ")", "+", "-", "*", "<", ">", "/", "^", ",", ".", "!", "|", ":", "=", "%", } // Available keywords in pongo2 TokenKeywords = []string{"in", "and", "or", "not", "true", "false", "as", "export"} ) type TokenType int type Token struct { Filename string Typ TokenType Val string Line int Col int TrimWhitespaces bool } type lexerStateFn func() lexerStateFn type lexer struct { name string input string start int // start pos of the item pos int // current pos width int // width of last rune tokens []*Token errored bool startline int startcol int line int col int inVerbatim bool verbatimName string } func (t *Token) String() string { val := t.Val if len(val) > 1000 { val = fmt.Sprintf("%s...%s", val[:10], val[len(val)-5:]) } typ := "" switch t.Typ { case TokenHTML: typ = "HTML" case TokenError: typ = "Error" case TokenIdentifier: typ = "Identifier" case TokenKeyword: typ = "Keyword" case TokenNumber: typ = "Number" case TokenString: typ = "String" case TokenSymbol: typ = "Symbol" default: typ = "Unknown" } return fmt.Sprintf("", typ, t.Typ, val, t.Line, t.Col, t.TrimWhitespaces) } func lex(name string, input string) ([]*Token, *Error) { l := &lexer{ name: name, input: input, tokens: make([]*Token, 0, 100), line: 1, col: 1, startline: 1, startcol: 1, } l.run() if l.errored { errtoken := l.tokens[len(l.tokens)-1] return nil, &Error{ Filename: name, Line: errtoken.Line, Column: errtoken.Col, Sender: "lexer", OrigError: errors.New(errtoken.Val), } } return l.tokens, nil } func (l *lexer) value() string { return l.input[l.start:l.pos] } func (l *lexer) length() int { return l.pos - l.start } func (l *lexer) emit(t TokenType) { tok := &Token{ Filename: l.name, Typ: t, Val: l.value(), Line: l.startline, Col: l.startcol, } if t == TokenString { // Escape sequence \" in strings tok.Val = strings.Replace(tok.Val, `\"`, `"`, -1) tok.Val = strings.Replace(tok.Val, `\\`, `\`, -1) } if t == TokenSymbol && len(tok.Val) == 3 && (strings.HasSuffix(tok.Val, "-") || strings.HasPrefix(tok.Val, "-")) { tok.TrimWhitespaces = true tok.Val = strings.Replace(tok.Val, "-", "", -1) } l.tokens = append(l.tokens, tok) l.start = l.pos l.startline = l.line l.startcol = l.col } func (l *lexer) next() rune { if l.pos >= len(l.input) { l.width = 0 return EOF } r, w := utf8.DecodeRuneInString(l.input[l.pos:]) l.width = w l.pos += l.width l.col += l.width return r } func (l *lexer) backup() { l.pos -= l.width l.col -= l.width } func (l *lexer) peek() rune { r := l.next() l.backup() return r } func (l *lexer) ignore() { l.start = l.pos l.startline = l.line l.startcol = l.col } func (l *lexer) accept(what string) bool { if strings.IndexRune(what, l.next()) >= 0 { return true } l.backup() return false } func (l *lexer) acceptRun(what string) { for strings.IndexRune(what, l.next()) >= 0 { } l.backup() } func (l *lexer) errorf(format string, args ...interface{}) lexerStateFn { t := &Token{ Filename: l.name, Typ: TokenError, Val: fmt.Sprintf(format, args...), Line: l.startline, Col: l.startcol, } l.tokens = append(l.tokens, t) l.errored = true l.startline = l.line l.startcol = l.col return nil } func (l *lexer) eof() bool { return l.start >= len(l.input)-1 } func (l *lexer) run() { for { // TODO: Support verbatim tag names // https://docs.djangoproject.com/en/dev/ref/templates/builtins/#verbatim if l.inVerbatim { name := l.verbatimName if name != "" { name += " " } if strings.HasPrefix(l.input[l.pos:], fmt.Sprintf("{%% endverbatim %s%%}", name)) { // end verbatim if l.pos > l.start { l.emit(TokenHTML) } w := len("{% endverbatim %}") l.pos += w l.col += w l.ignore() l.inVerbatim = false } } else if strings.HasPrefix(l.input[l.pos:], "{% verbatim %}") { // tag if l.pos > l.start { l.emit(TokenHTML) } l.inVerbatim = true w := len("{% verbatim %}") l.pos += w l.col += w l.ignore() } if !l.inVerbatim { // Ignore single-line comments {# ... #} if strings.HasPrefix(l.input[l.pos:], "{#") { if l.pos > l.start { l.emit(TokenHTML) } l.pos += 2 // pass '{#' l.col += 2 for { switch l.peek() { case EOF: l.errorf("Single-line comment not closed.") return case '\n': l.errorf("Newline not permitted in a single-line comment.") return } if strings.HasPrefix(l.input[l.pos:], "#}") { l.pos += 2 // pass '#}' l.col += 2 break } l.next() } l.ignore() // ignore whole comment // Comment skipped continue // next token } if strings.HasPrefix(l.input[l.pos:], "{{") || // variable strings.HasPrefix(l.input[l.pos:], "{%") { // tag if l.pos > l.start { l.emit(TokenHTML) } l.tokenize() if l.errored { return } continue } } switch l.peek() { case '\n': l.line++ l.col = 0 } if l.next() == EOF { break } } if l.pos > l.start { l.emit(TokenHTML) } if l.inVerbatim { l.errorf("verbatim-tag not closed, got EOF.") } } func (l *lexer) tokenize() { for state := l.stateCode; state != nil; { state = state() } } func (l *lexer) stateCode() lexerStateFn { outer_loop: for { switch { case l.accept(tokenSpaceChars): if l.value() == "\n" { return l.errorf("Newline not allowed within tag/variable.") } l.ignore() continue case l.accept(tokenIdentifierChars): return l.stateIdentifier case l.accept(tokenDigits): return l.stateNumber case l.accept(`"'`): return l.stateString } // Check for symbol for _, sym := range TokenSymbols { if strings.HasPrefix(l.input[l.start:], sym) { l.pos += len(sym) l.col += l.length() l.emit(TokenSymbol) if sym == "%}" || sym == "-%}" || sym == "}}" || sym == "-}}" { // Tag/variable end, return after emit return nil } continue outer_loop } } break } // Normal shut down return nil } func (l *lexer) stateIdentifier() lexerStateFn { l.acceptRun(tokenIdentifierChars) l.acceptRun(tokenIdentifierCharsWithDigits) for _, kw := range TokenKeywords { if kw == l.value() { l.emit(TokenKeyword) return l.stateCode } } l.emit(TokenIdentifier) return l.stateCode } func (l *lexer) stateNumber() lexerStateFn { l.acceptRun(tokenDigits) if l.accept(tokenIdentifierCharsWithDigits) { // This seems to be an identifier starting with a number. // See https://github.com/flosch/pongo2/issues/151 return l.stateIdentifier() } /* Maybe context-sensitive number lexing? * comments.0.Text // first comment * usercomments.1.0 // second user, first comment * if (score >= 8.5) // 8.5 as a number if l.peek() == '.' { l.accept(".") if !l.accept(tokenDigits) { return l.errorf("Malformed number.") } l.acceptRun(tokenDigits) } */ l.emit(TokenNumber) return l.stateCode } func (l *lexer) stateString() lexerStateFn { quotationMark := l.value() l.ignore() l.startcol-- // we're starting the position at the first " for !l.accept(quotationMark) { switch l.next() { case '\\': // escape sequence switch l.peek() { case '"', '\\': l.next() default: return l.errorf("Unknown escape sequence: \\%c", l.peek()) } case EOF: return l.errorf("Unexpected EOF, string not closed.") case '\n': return l.errorf("Newline in string is not allowed.") } } l.backup() l.emit(TokenString) l.next() l.ignore() return l.stateCode }