Lexing Python into tokens for a context-free grammar
By lexicalscope.dev
Every parser I've written has been for a context-free grammar, and all of them have used recursive descent. I never thought about how I would parse a context-sensitive language like YAML or Python until a tweet recently sparked my interest:
A trick to parse indentation-based grammar such as YAML. They are not in CFG, so you can't use usual techniques such as recursive descent. But if you make the lexer context-aware and emit INDENT and DEDENT tokens for indentation changes, they become CFG.
This post demonstrates a very basic lexer for a Python-like context-sensitive language. The goal is an example lexer that emits INDENT and DEDENT tokens where code blocks open and close, making the resulting token stream context-free and ready for an ordinary context-free parser.
Python's documentation on lexical analysis describes how indentation is handled using INDENT and DEDENT tokens:
Before the first line of the file is read, a single zero is pushed on the stack; this will never be popped off again. The numbers pushed on the stack will always be strictly increasing from bottom to top. At the beginning of each logical line, the line’s indentation level is compared to the top of the stack. If it is equal, nothing happens. If it is larger, it is pushed on the stack, and one INDENT token is generated. If it is smaller, it must be one of the numbers occurring on the stack; all numbers on the stack that are larger are popped off, and for each number popped off a DEDENT token is generated. At the end of the file, a DEDENT token is generated for each number remaining on the stack that is larger than zero.
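To make that concrete before diving into the lexer, here is a minimal sketch of just the stack logic in Go. The indentTokens helper is a name of my own for illustration (it is not part of the lexer built below), and it assumes the indentation width of each logical line has already been measured:

package main

import "fmt"

// indentTokens applies the documented stack algorithm to a list of
// per-line indentation widths and returns the INDENT/DEDENT events
func indentTokens(widths []int) []string {
    stack := []int{0} // the sentinel zero that is never popped
    out := make([]string, 0)
    for _, w := range widths {
        top := stack[len(stack)-1]
        switch {
        case w > top:
            // deeper indent: push and emit one INDENT
            stack = append(stack, w)
            out = append(out, "INDENT")
        case w < top:
            // shallower indent: pop and emit a DEDENT per level closed
            for stack[len(stack)-1] > w {
                stack = stack[:len(stack)-1]
                out = append(out, "DEDENT")
            }
            // a real lexer reports an error here if w is not now on top
        }
    }
    // end of file: close every block still open
    for len(stack) > 1 {
        stack = stack[:len(stack)-1]
        out = append(out, "DEDENT")
    }
    return out
}

func main() {
    fmt.Println(indentTokens([]int{0, 4, 8, 4, 0}))
    // prints: [INDENT INDENT DEDENT DEDENT]
}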
Python itself uses a PEG parser generator; I'll be writing the lexer by hand. At a high level, we'll follow the lexical analysis rules above, with one critical caveat: we will only create a new INDENT token if the previous two tokens mark the start of a new block, a colon followed by a newline. As a regex, the end of a block-opening line looks like :\s*\n.
Using this slightly modified algorithm, we end up with the following tokens from a simple example (shown here before the final brace rewrite described at the end of the post):
def foobar(a):
    print("bar")

foobar(11)
'def'
'foobar'
'('
'a'
')'
':'
'\n'
'<indent>'
'print'
'('
'bar'
')'
'\n'
'<dedent>'
'\n'
'foobar'
'('
'11'
')'
'\n'
At this point, each INDENT token corresponds to the start of a new block, and a DEDENT token closes it. In my lexer, the beginning of each new line is signaled by the column index equaling zero. Whenever the column index is zero, the lexer calls the makeIndentToken method, which emits the INDENT and DEDENT tokens. The following snippet from the token function is where the column index is checked.
func (l *Lexer) token() (*Token, error) {
    r := l.peek()
    if r == '\000' {
        return nil, io.EOF
    }
    if l.col == 0 {
        err := l.makeIndentToken()
        if err != nil {
            return nil, err
        }
    }
    ...
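The makeIndentToken method is where the indent stack is maintained and the INDENT and DEDENT tokens are created: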
func (l *Lexer) makeIndentToken() error {
    // count the leading spaces on this line
    count := 0
    for {
        r := l.peek()
        if r != ' ' {
            break
        }
        l.read()
        count++
    }

    if count == 0 {
        // back at the top level: pop everything above the sentinel zero,
        // emitting a DEDENT for each block that closes
        for len(l.indent) > 1 {
            l.indent = l.indent[1:]
            l.tokens = append(l.tokens, &Token{Value: "<dedent>", Type: DEDENT})
        }
        return nil
    }

    if count > l.indent[0] {
        // only add an indent token if the last two tokens were ":\n"
        num := len(l.tokens)
        if num > 1 && l.tokens[num-1].Type == NEWLINE && l.tokens[num-2].Type == COLON {
            l.indent = append([]int{count}, l.indent...)
            l.tokens = append(l.tokens, &Token{Value: "<indent>", Type: INDENT})
            return nil
        }
        return fmt.Errorf("indentation error at %v:%v", l.row, l.col)
    }

    if count < l.indent[0] {
        // pop every level larger than the new indent, emitting a DEDENT
        // for each; an exact match must then sit on top of the stack
        for len(l.indent) > 0 && l.indent[0] > count {
            l.indent = l.indent[1:]
            l.tokens = append(l.tokens, &Token{Value: "<dedent>", Type: DEDENT})
        }
        if len(l.indent) == 0 || l.indent[0] != count {
            return fmt.Errorf("indentation error at %v:%v", l.row, l.col)
        }
    }

    return nil
}
The count == 0 branch: if the indent is 0, we're back to the top-level block. Pop every value above the sentinel zero from the indent stack and emit a DEDENT token for each.
The count > l.indent[0] branch: a larger indent signals a new block. However, this is where the minor hack exists. Only create the new block if the preceding two tokens match a Python block opening, a colon followed by a newline (as far as I know). Throw an indentation error otherwise.
The count < l.indent[0] branch: this is identical to the lexical documentation. Pop each stack entry larger than the new indent and emit a DEDENT token for it; keep popping until an exact match sits on top of the stack. If no exact match exists, throw an indentation error.
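To see that last branch emit more than one token, take a doubly nested example (my own input, not from the sample above):

def foo(a):
    if a > 1:
        print("big")
print("done")

When the indentation drops from eight spaces straight back to zero, the stack pops twice, and the interesting stretch of the (pre-brace-rewrite) token stream is:

'\n'
'<dedent>'
'<dedent>'
'print'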
The code in full is below:
// $ROOT/cmd/main.go
package main

import (
    "flag"
    "fmt"
    "io"
    "os"

    "github.com/kgwinnup/go-lex-space/internal/lexer"
)

func main() {
    flag.Parse()

    input := ""

    if len(flag.Args()) > 0 {
        bs, err := os.ReadFile(flag.Args()[0])
        if err != nil {
            fmt.Fprintf(os.Stderr, "%v\n", err)
            os.Exit(1)
        }
        input = string(bs)
        fmt.Println(input)
    } else {
        bs, err := io.ReadAll(os.Stdin)
        if err != nil {
            fmt.Fprintf(os.Stderr, "%v\n", err)
            os.Exit(1)
        }
        input = string(bs)
    }

    lex := lexer.New(input)
    toks, err := lex.Scan()
    if err != nil {
        fmt.Fprintf(os.Stderr, "%v\n", err)
        os.Exit(1)
    }

    for _, tok := range toks {
        fmt.Printf("'%v'\n", tok.Value)
    }
}
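Assuming the module layout in the path comments, running the lexer looks like this (simple.py stands in for any input file; piping source on stdin also works):

go run ./cmd simple.py

The program echoes the file and then prints one quoted token value per line.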
// $ROOT/internal/lexer/lexer.go
package lexer

import (
    "fmt"
    "io"
    "strings"
    "unicode"
)

type Token struct {
    Value string
    Type  int
}
const (
    DEF = iota
    IDENTITY
    IF
    ELSE
    COLON
    NEWLINE
    INDENT
    DEDENT
    STRING
    INTEGER
    LPAREN
    RPAREN
    LBRACE
    RBRACE
    GT
)

var keywords = map[string]int{
    "if":   IF,
    "else": ELSE,
    "def":  DEF,
}
type Lexer struct {
    input  []rune
    tokens []*Token
    index  int
    indent []int
    col    int
    row    int
}

func New(input string) *Lexer {
    // the indent stack starts with the sentinel zero that is never popped
    indent := make([]int, 0)
    indent = append(indent, 0)

    return &Lexer{
        input:  []rune(input),
        tokens: make([]*Token, 0),
        index:  0,
        indent: indent,
        col:    0,
    }
}
// read consumes the next rune, tracking the current row and column
func (l *Lexer) read() (rune, error) {
    if l.index < len(l.input) {
        temp := l.input[l.index]
        if temp == '\n' {
            l.col = 0
            l.row++
        } else {
            l.col++
        }
        l.index++
        return temp, nil
    }
    return '\000', io.EOF
}

// peek returns the next rune without consuming it
func (l *Lexer) peek() rune {
    if l.index < len(l.input) {
        return l.input[l.index]
    }
    return '\000'
}
func (l *Lexer) Scan() ([]*Token, error) {
    for {
        tok, err := l.token()
        if err == io.EOF {
            break
        }
        if err != nil {
            return nil, err
        }
        l.tokens = append(l.tokens, tok)
    }

    // per the lexical analysis docs: at end of file, emit a DEDENT for
    // each indent level still on the stack above the sentinel zero
    for len(l.indent) > 1 {
        l.indent = l.indent[1:]
        l.tokens = append(l.tokens, &Token{Value: "<dedent>", Type: DEDENT})
    }

    return l.makeCFG(), nil
}
// makeCFG rewrites INDENT/DEDENT as explicit brace tokens so the stream
// reads like a C-style, brace-delimited block structure
func (l *Lexer) makeCFG() []*Token {
    out := make([]*Token, 0)

    for _, tok := range l.tokens {
        if tok.Type == INDENT {
            out = append(out, &Token{Value: "{", Type: LBRACE})
            continue
        }
        if tok.Type == DEDENT {
            out = append(out, &Token{Value: "}", Type: RBRACE})
            continue
        }
        out = append(out, tok)
    }

    return out
}
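One detail worth calling out: Scan passes the finished stream through makeCFG, which rewrites every INDENT token as { and every DEDENT as }. The token listings earlier in the post show the stream before this rewrite; the program as written prints { and } in their place, yielding a brace-delimited block structure that an ordinary context-free parser can consume directly.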
func (l *Lexer) makeIndentToken() error {
    // count the leading spaces on this line
    count := 0
    for {
        r := l.peek()
        if r != ' ' {
            break
        }
        l.read()
        count++
    }

    if count == 0 {
        // back at the top level: pop everything above the sentinel zero,
        // emitting a DEDENT for each block that closes
        for len(l.indent) > 1 {
            l.indent = l.indent[1:]
            l.tokens = append(l.tokens, &Token{Value: "<dedent>", Type: DEDENT})
        }
        return nil
    }

    if count > l.indent[0] {
        // only add an indent token if the last two tokens were ":\n"
        num := len(l.tokens)
        if num > 1 && l.tokens[num-1].Type == NEWLINE && l.tokens[num-2].Type == COLON {
            l.indent = append([]int{count}, l.indent...)
            l.tokens = append(l.tokens, &Token{Value: "<indent>", Type: INDENT})
            return nil
        }
        return fmt.Errorf("indentation error at %v:%v", l.row, l.col)
    }

    if count < l.indent[0] {
        // pop every level larger than the new indent, emitting a DEDENT
        // for each; an exact match must then sit on top of the stack
        for len(l.indent) > 0 && l.indent[0] > count {
            l.indent = l.indent[1:]
            l.tokens = append(l.tokens, &Token{Value: "<dedent>", Type: DEDENT})
        }
        if len(l.indent) == 0 || l.indent[0] != count {
            return fmt.Errorf("indentation error at %v:%v", l.row, l.col)
        }
    }

    return nil
}
func (l *Lexer) token() (*Token, error) {
    r := l.peek()
    if r == '\000' {
        return nil, io.EOF
    }

    // column zero means a new line just started: handle indentation first
    if l.col == 0 {
        err := l.makeIndentToken()
        if err != nil {
            return nil, err
        }
    }

    // skip any interior whitespace before the next token
    for {
        r = l.peek()
        if r == ' ' {
            l.read()
            continue
        }
        break
    }

    switch r {
    case '\n':
        l.read()
        return &Token{Value: "\\n", Type: NEWLINE}, nil
    case ':':
        l.read()
        return &Token{Value: ":", Type: COLON}, nil
    case '>':
        l.read()
        return &Token{Value: ">", Type: GT}, nil
    case '(':
        l.read()
        return &Token{Value: "(", Type: LPAREN}, nil
    case ')':
        l.read()
        return &Token{Value: ")", Type: RPAREN}, nil
    case '"':
        return l.readString()
    case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
        return l.readNumber()
    default:
        ident, err := l.readIdentity()
        if err != nil {
            return nil, err
        }

        // promote identifiers that are actually keywords
        if typ, ok := keywords[ident.Value]; ok {
            return &Token{
                Value: ident.Value,
                Type:  typ,
            }, nil
        }

        return ident, nil
    }
}
func (l *Lexer) readIdentity() (*Token, error) {
    var builder strings.Builder

    // an identifier starts with a letter...
    tok := l.peek()
    if unicode.IsLetter(tok) {
        r, err := l.read()
        if err != nil {
            return nil, err
        }
        builder.WriteRune(r)
    }

    // ...and continues with letters or digits
    for {
        tok := l.peek()
        if unicode.IsLetter(tok) || unicode.IsDigit(tok) {
            r, err := l.read()
            if err != nil {
                return nil, err
            }
            builder.WriteRune(r)
            continue
        }
        break
    }

    return &Token{
        Value: builder.String(),
        Type:  IDENTITY,
    }, nil
}
func (l *Lexer) readNumber() (*Token, error) {
    var builder strings.Builder

    for {
        tok := l.peek()
        if !unicode.IsDigit(tok) {
            break
        }
        r, err := l.read()
        if err != nil {
            return nil, err
        }
        builder.WriteRune(r)
    }

    return &Token{
        Value: builder.String(),
        Type:  INTEGER,
    }, nil
}
func (l *Lexer) readString() (*Token, error) {
    var builder strings.Builder
    l.read() // consume the opening quote

    for {
        tok := l.peek()
        // an unterminated string simply ends at EOF
        if tok == '\000' {
            break
        }
        if tok == '"' {
            l.read() // consume the closing quote
            break
        }
        r, err := l.read()
        if err != nil {
            return nil, err
        }
        builder.WriteRune(r)
    }

    return &Token{
        Value: builder.String(),
        Type:  STRING,
    }, nil
}