Lexing Python into tokens for a context-free grammar
By lexicalscope.dev
Every parser I've written has been for a context-free grammar, and all of them have used recursive descent. I never thought about how I would parse a context-sensitive language like YAML or Python until a tweet recently sparked my interest:
A trick to parse indentation-based grammar such as YAML. They are not in CFG, so you can't use usual techniques such as recursive descent. But if you make the lexer context-aware and emit INDENT and DEDENT tokens for indentation changes, they become CFG.
This post demonstrates a very basic lexer for a Python-like context-sensitive language. The goal is an example lexer that emits INDENT and DEDENT tokens where code blocks open and close, making the resulting token stream context-free and ready for an ordinary context-free parser.
Python's documentation on lexical analysis describes how indentation is handled using INDENT and DEDENT tokens:
Before the first line of the file is read, a single zero is pushed on the stack; this will never be popped off again. The numbers pushed on the stack will always be strictly increasing from bottom to top. At the beginning of each logical line, the line’s indentation level is compared to the top of the stack. If it is equal, nothing happens. If it is larger, it is pushed on the stack, and one INDENT token is generated. If it is smaller, it must be one of the numbers occurring on the stack; all numbers on the stack that are larger are popped off, and for each number popped off a DEDENT token is generated. At the end of the file, a DEDENT token is generated for each number remaining on the stack that is larger than zero.
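To make that concrete before diving into the lexer, here is a minimal sketch of just the stack logic in Go. The indentTokens helper is a name of my own for illustration (it is not part of the lexer built below), and it assumes the indentation width of each logical line has already been measured:

package main

import "fmt"

// indentTokens applies the documented stack algorithm to a list of
// per-line indentation widths and returns the INDENT/DEDENT events
func indentTokens(widths []int) []string {
    stack := []int{0} // the sentinel zero that is never popped
    out := make([]string, 0)
    for _, w := range widths {
        top := stack[len(stack)-1]
        switch {
        case w > top:
            // deeper indent: push and emit one INDENT
            stack = append(stack, w)
            out = append(out, "INDENT")
        case w < top:
            // shallower indent: pop and emit a DEDENT per level closed
            for stack[len(stack)-1] > w {
                stack = stack[:len(stack)-1]
                out = append(out, "DEDENT")
            }
            // a real lexer reports an error here if w is not now on top
        }
    }
    // end of file: close every block still open
    for len(stack) > 1 {
        stack = stack[:len(stack)-1]
        out = append(out, "DEDENT")
    }
    return out
}

func main() {
    fmt.Println(indentTokens([]int{0, 4, 8, 4, 0}))
    // prints: [INDENT INDENT DEDENT DEDENT]
}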
Python itself uses a PEG parser generator; I'll be writing the lexer by hand. At a high level, we'll follow the lexical analysis rules above, with one critical caveat: we will only create a new INDENT token if the previous two tokens mark the start of a new block, a colon followed by a newline. As a regex, the end of a block-opening line looks like :\s*\n.
Using this slightly modified algorithm, we end up with the following tokens from a simple example (shown here before the final brace rewrite described at the end of the post):
def foobar(a):
    print("bar")

foobar(11)
'def'
'foobar'
'('
'a'
')'
':'
'\n'
'<indent>'
'print'
'('
'bar'
')'
'\n'
'<dedent>'
'\n'
'foobar'
'('
'11'
')'
'\n'
At this point, each INDENT token corresponds to the start of a new block, and a DEDENT token closes it. In my lexer, the beginning of each new line is signaled by the column index equaling zero. Whenever the column index is zero, the lexer calls the makeIndentToken method, which emits the INDENT and DEDENT tokens. The following snippet from the token function is where the column index is checked.
func (l *Lexer) token() (*Token, error) {
    r := l.peek()
    if r == '\000' {
        return nil, io.EOF
    }
    if l.col == 0 {
        err := l.makeIndentToken()
        if err != nil {
            return nil, err
        }
    }
    ...
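The makeIndentToken method is where the indent stack is maintained and the INDENT and DEDENT tokens are created: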
func (l *Lexer) makeIndentToken() error {
    // count the leading spaces on this line
    count := 0
    for {
        r := l.peek()
        if r != ' ' {
            break
        }
        l.read()
        count++
    }

    if count == 0 {
        // back at the top level: pop everything above the sentinel zero,
        // emitting a DEDENT for each block that closes
        for len(l.indent) > 1 {
            l.indent = l.indent[1:]
            l.tokens = append(l.tokens, &Token{Value: "<dedent>", Type: DEDENT})
        }
        return nil
    }

    if count > l.indent[0] {
        // only add an indent token if the last two tokens were ":\n"
        num := len(l.tokens)
        if num > 1 && l.tokens[num-1].Type == NEWLINE && l.tokens[num-2].Type == COLON {
            l.indent = append([]int{count}, l.indent...)
            l.tokens = append(l.tokens, &Token{Value: "<indent>", Type: INDENT})
            return nil
        }
        return fmt.Errorf("indentation error at %v:%v", l.row, l.col)
    }

    if count < l.indent[0] {
        // pop every level larger than the new indent, emitting a DEDENT
        // for each; an exact match must then sit on top of the stack
        for len(l.indent) > 0 && l.indent[0] > count {
            l.indent = l.indent[1:]
            l.tokens = append(l.tokens, &Token{Value: "<dedent>", Type: DEDENT})
        }
        if len(l.indent) == 0 || l.indent[0] != count {
            return fmt.Errorf("indentation error at %v:%v", l.row, l.col)
        }
    }

    return nil
}
The count == 0 branch: if the indent is 0, we're back to the top-level block. Pop every value above the sentinel zero from the indent stack and emit a DEDENT token for each.
The count > l.indent[0] branch: a larger indent signals a new block. However, this is where the minor hack exists. Only create the new block if the preceding two tokens match a Python block opening, a colon followed by a newline (as far as I know). Throw an indentation error otherwise.
The count < l.indent[0] branch: this is identical to the lexical documentation. Pop each stack entry larger than the new indent and emit a DEDENT token for it; keep popping until an exact match sits on top of the stack. If no exact match exists, throw an indentation error.
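To see that last branch emit more than one token, take a doubly nested example (my own input, not from the sample above):

def foo(a):
    if a > 1:
        print("big")
print("done")

When the indentation drops from eight spaces straight back to zero, the stack pops twice, and the interesting stretch of the (pre-brace-rewrite) token stream is:

'\n'
'<dedent>'
'<dedent>'
'print'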
The code in full is below:
// $ROOT/cmd/main.go
package main

import (
    "flag"
    "fmt"
    "io"
    "os"

    "github.com/kgwinnup/go-lex-space/internal/lexer"
)

func main() {
    flag.Parse()

    input := ""

    if len(flag.Args()) > 0 {
        bs, err := os.ReadFile(flag.Args()[0])
        if err != nil {
            fmt.Fprintf(os.Stderr, "%v\n", err)
            os.Exit(1)
        }
        input = string(bs)
        fmt.Println(input)
    } else {
        bs, err := io.ReadAll(os.Stdin)
        if err != nil {
            fmt.Fprintf(os.Stderr, "%v\n", err)
            os.Exit(1)
        }
        input = string(bs)
    }

    lex := lexer.New(input)
    toks, err := lex.Scan()
    if err != nil {
        fmt.Fprintf(os.Stderr, "%v\n", err)
        os.Exit(1)
    }

    for _, tok := range toks {
        fmt.Printf("'%v'\n", tok.Value)
    }
}
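Assuming the module layout in the path comments, running the lexer looks like this (simple.py stands in for any input file; piping source on stdin also works):

go run ./cmd simple.py

The program echoes the file and then prints one quoted token value per line.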
// $ROOT/internal/lexer/lexer.go
package lexer

import (
    "fmt"
    "io"
    "strings"
    "unicode"
)

type Token struct {
    Value string
    Type  int
}
const (
    DEF = iota
    IDENTITY
    IF
    ELSE
    COLON
    NEWLINE
    INDENT
    DEDENT
    STRING
    INTEGER
    LPAREN
    RPAREN
    LBRACE
    RBRACE
    GT
)

var keywords = map[string]int{
    "if":   IF,
    "else": ELSE,
    "def":  DEF,
}
type Lexer struct {
    input  []rune
    tokens []*Token
    index  int
    indent []int
    col    int
    row    int
}

func New(input string) *Lexer {
    // the indent stack starts with the sentinel zero that is never popped
    indent := make([]int, 0)
    indent = append(indent, 0)

    return &Lexer{
        input:  []rune(input),
        tokens: make([]*Token, 0),
        index:  0,
        indent: indent,
        col:    0,
    }
}
// read consumes the next rune, tracking the current row and column
func (l *Lexer) read() (rune, error) {
    if l.index < len(l.input) {
        temp := l.input[l.index]
        if temp == '\n' {
            l.col = 0
            l.row++
        } else {
            l.col++
        }
        l.index++
        return temp, nil
    }
    return '\000', io.EOF
}

// peek returns the next rune without consuming it
func (l *Lexer) peek() rune {
    if l.index < len(l.input) {
        return l.input[l.index]
    }
    return '\000'
}
func (l *Lexer) Scan() ([]*Token, error) {
    for {
        tok, err := l.token()
        if err == io.EOF {
            break
        }
        if err != nil {
            return nil, err
        }
        l.tokens = append(l.tokens, tok)
    }

    // per the lexical analysis docs: at end of file, emit a DEDENT for
    // each indent level still on the stack above the sentinel zero
    for len(l.indent) > 1 {
        l.indent = l.indent[1:]
        l.tokens = append(l.tokens, &Token{Value: "<dedent>", Type: DEDENT})
    }

    return l.makeCFG(), nil
}
// makeCFG rewrites INDENT/DEDENT as explicit brace tokens so the stream
// reads like a C-style, brace-delimited block structure
func (l *Lexer) makeCFG() []*Token {
    out := make([]*Token, 0)

    for _, tok := range l.tokens {
        if tok.Type == INDENT {
            out = append(out, &Token{Value: "{", Type: LBRACE})
            continue
        }
        if tok.Type == DEDENT {
            out = append(out, &Token{Value: "}", Type: RBRACE})
            continue
        }
        out = append(out, tok)
    }

    return out
}
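One detail worth calling out: Scan passes the finished stream through makeCFG, which rewrites every INDENT token as { and every DEDENT as }. The token listings earlier in the post show the stream before this rewrite; the program as written prints { and } in their place, yielding a brace-delimited block structure that an ordinary context-free parser can consume directly.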
func (l *Lexer) makeIndentToken() error {
    // count the leading spaces on this line
    count := 0
    for {
        r := l.peek()
        if r != ' ' {
            break
        }
        l.read()
        count++
    }

    if count == 0 {
        // back at the top level: pop everything above the sentinel zero,
        // emitting a DEDENT for each block that closes
        for len(l.indent) > 1 {
            l.indent = l.indent[1:]
            l.tokens = append(l.tokens, &Token{Value: "<dedent>", Type: DEDENT})
        }
        return nil
    }

    if count > l.indent[0] {
        // only add an indent token if the last two tokens were ":\n"
        num := len(l.tokens)
        if num > 1 && l.tokens[num-1].Type == NEWLINE && l.tokens[num-2].Type == COLON {
            l.indent = append([]int{count}, l.indent...)
            l.tokens = append(l.tokens, &Token{Value: "<indent>", Type: INDENT})
            return nil
        }
        return fmt.Errorf("indentation error at %v:%v", l.row, l.col)
    }

    if count < l.indent[0] {
        // pop every level larger than the new indent, emitting a DEDENT
        // for each; an exact match must then sit on top of the stack
        for len(l.indent) > 0 && l.indent[0] > count {
            l.indent = l.indent[1:]
            l.tokens = append(l.tokens, &Token{Value: "<dedent>", Type: DEDENT})
        }
        if len(l.indent) == 0 || l.indent[0] != count {
            return fmt.Errorf("indentation error at %v:%v", l.row, l.col)
        }
    }

    return nil
}
func (l *Lexer) token() (*Token, error) {
    r := l.peek()
    if r == '\000' {
        return nil, io.EOF
    }

    // column zero means a new line just started: handle indentation first
    if l.col == 0 {
        err := l.makeIndentToken()
        if err != nil {
            return nil, err
        }
    }

    // skip any interior whitespace before the next token
    for {
        r = l.peek()
        if r == ' ' {
            l.read()
            continue
        }
        break
    }

    switch r {
    case '\n':
        l.read()
        return &Token{Value: "\\n", Type: NEWLINE}, nil
    case ':':
        l.read()
        return &Token{Value: ":", Type: COLON}, nil
    case '>':
        l.read()
        return &Token{Value: ">", Type: GT}, nil
    case '(':
        l.read()
        return &Token{Value: "(", Type: LPAREN}, nil
    case ')':
        l.read()
        return &Token{Value: ")", Type: RPAREN}, nil
    case '"':
        return l.readString()
    case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
        return l.readNumber()
    default:
        ident, err := l.readIdentity()
        if err != nil {
            return nil, err
        }

        // promote identifiers that are actually keywords
        if typ, ok := keywords[ident.Value]; ok {
            return &Token{
                Value: ident.Value,
                Type:  typ,
            }, nil
        }

        return ident, nil
    }
}
func (l *Lexer) readIdentity() (*Token, error) {
    var builder strings.Builder

    // an identifier starts with a letter...
    tok := l.peek()
    if unicode.IsLetter(tok) {
        r, err := l.read()
        if err != nil {
            return nil, err
        }
        builder.WriteRune(r)
    }

    // ...and continues with letters or digits
    for {
        tok := l.peek()
        if unicode.IsLetter(tok) || unicode.IsDigit(tok) {
            r, err := l.read()
            if err != nil {
                return nil, err
            }
            builder.WriteRune(r)
            continue
        }
        break
    }

    return &Token{
        Value: builder.String(),
        Type:  IDENTITY,
    }, nil
}
func (l *Lexer) readNumber() (*Token, error) {
    var builder strings.Builder

    for {
        tok := l.peek()
        if !unicode.IsDigit(tok) {
            break
        }
        r, err := l.read()
        if err != nil {
            return nil, err
        }
        builder.WriteRune(r)
    }

    return &Token{
        Value: builder.String(),
        Type:  INTEGER,
    }, nil
}
func (l *Lexer) readString() (*Token, error) {
    var builder strings.Builder
    l.read() // consume the opening quote

    for {
        tok := l.peek()
        // an unterminated string simply ends at EOF
        if tok == '\000' {
            break
        }
        if tok == '"' {
            l.read() // consume the closing quote
            break
        }
        r, err := l.read()
        if err != nil {
            return nil, err
        }
        builder.WriteRune(r)
    }

    return &Token{
        Value: builder.String(),
        Type:  STRING,
    }, nil
}