# Kaleidoscope LLVM 编写语言中文教程

## 基本语言

```# 计算第x个数字
def fib(x)
if x < 3 then
1
else
fib(x-1)+fib(x-2)

# 计算第40个数字
fib(40)```

``````extern sin(arg);
extern cos(arg);
extern atan2(arg1 arg2);

atan2(sin(.4), cos(42))``````

## 词法分析

``````// The lexer returns tokens [0-255] if it is an unknown character, otherwise one
// of these for known things.
enum Token {
tok_eof = -1,

// commands
tok_def = -2, tok_extern = -3,

// primary
tok_identifier = -4, tok_number = -5,
};

static std::string IdentifierStr;  // Filled in if tok_identifier
static double NumVal;              // Filled in if tok_number``````

``````/// gettok - Return the next token from standard input.
static int gettok() {
static int LastChar = ' ';

// Skip any whitespace.
while (isspace(LastChar))
LastChar = getchar();``````

`gettok`通过调用C语言的`getchar()`来读取标准输入流的字符，它读取字符后会将其保存在`LastChar`并剔除出输入流。首先要做的是忽略token之间的空白符，这一步由上面的循环完成。

``````if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9]*
  IdentifierStr = LastChar;
  while (isalnum((LastChar = getchar())))
    IdentifierStr += LastChar;

  // Keywords are lexed like any identifier and then mapped to their own
  // token; everything else is reported as a plain identifier whose text is
  // left in IdentifierStr.
  if (IdentifierStr == "def") return tok_def;
  if (IdentifierStr == "extern") return tok_extern;
  return tok_identifier;
}``````

``````if (isdigit(LastChar) || LastChar == '.') {   // Number: [0-9.]+
  std::string NumStr;
  do {
    NumStr += LastChar;
    LastChar = getchar();
  } while (isdigit(LastChar) || LastChar == '.');

  // strtod converts the longest valid prefix, so input such as "1.2.3" is
  // accepted by the loop above but yields 1.2.
  NumVal = strtod(NumStr.c_str(), 0);
  return tok_number;
}``````

``````if (LastChar == '#') {
// Comment until end of line.
do LastChar = getchar();
while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');

if (LastChar != EOF)
return gettok();
}``````

``````// Check for end of file.  Don't eat the EOF.
if (LastChar == EOF)

// Otherwise, just return the character as its ascii value.
int ThisChar = LastChar;
LastChar = getchar();
return ThisChar;
}``````

## 抽象语法树(AST)

``````/// ExprAST - Base class for all expression nodes.
class ExprAST {
public:
virtual ~ExprAST() {}
};

/// NumberExprAST - Expression class for numeric literals like "1.0".
class NumberExprAST : public ExprAST {
double Val;
public:
NumberExprAST(double val) : Val(val) {}
};``````

``````/// VariableExprAST - Expression class for referencing a variable, like "a".
class VariableExprAST : public ExprAST {
std::string Name;
public:
VariableExprAST(const std::string &name) : Name(name) {}
};

/// BinaryExprAST - Expression class for a binary operator.
class BinaryExprAST : public ExprAST {
char Op;
ExprAST *LHS, *RHS;
public:
BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs)
: Op(op), LHS(lhs), RHS(rhs) {}
};

/// CallExprAST - Expression class for function calls.
class CallExprAST : public ExprAST {
std::string Callee;
std::vector<ExprAST*> Args;
public:
CallExprAST(const std::string &callee, std::vector<ExprAST*> &args)
: Callee(callee), Args(args) {}
};``````

``````/// PrototypeAST - This class represents the "prototype" for a function,
/// which captures its name, and its argument names (thus implicitly the number
/// of arguments the function takes).
class PrototypeAST {
std::string Name;
std::vector<std::string> Args;
public:
PrototypeAST(const std::string &name, const std::vector<std::string> &args)
: Name(name), Args(args) {}
};

/// FunctionAST - This class represents a function definition itself.
class FunctionAST {
PrototypeAST *Proto;
ExprAST *Body;
public:
FunctionAST(PrototypeAST *proto, ExprAST *body)
: Proto(proto), Body(body) {}
};``````

## 解析基础

``````ExprAST *X = new VariableExprAST("x");
ExprAST *Y = new VariableExprAST("y");
ExprAST *Result = new BinaryExprAST('+', X, Y);``````

``````/// CurTok/getNextToken - Provide a simple token buffer.  CurTok is the current
/// token the parser is looking at.  getNextToken reads another token from the
/// lexer and updates CurTok with its results.
static int CurTok;
static int getNextToken() {
return CurTok = gettok();
}``````

``````/// Error* - These are little helper functions for error handling.
ExprAST *Error(const char *Str) { fprintf(stderr, "Error: %s\n", Str);return 0;}
PrototypeAST *ErrorP(const char *Str) { Error(Str); return 0; }
FunctionAST *ErrorF(const char *Str) { Error(Str); return 0; }``````

## 基本表达式解析

``````/// numberexpr ::= number
static ExprAST *ParseNumberExpr() {
ExprAST *Result = new NumberExprAST(NumVal);
getNextToken(); // consume the number
return Result;
}``````

``````/// parenexpr ::= '(' expression ')'
static ExprAST *ParseParenExpr() {
getNextToken();  // eat (.
ExprAST *V = ParseExpression();
if (!V) return 0;

if (CurTok != ')')
return Error("expected ')'");
getNextToken();  // eat ).
return V;
}``````

• 异常检测：当被调用时，这个函数会默认当前的token是`(`，但是当结束表达式解析后，有可能末尾的token就不是`)`。比如，如果用户错将`(4)`打成了`(4 *`，解析器就会检测到这个错误，为了提醒有错误发生，我们的解析器将返回NULL。
• 递归式解析：这段函数中调用了`ParseExpression`（我们将很快看到`ParseExpression`同样会调用`ParseParenExpr`）。这种方式相当强大，因为它允许我们处理嵌套的语法，同时也保持了每一个过程都是相当简洁。注意，括号并不会成为抽象语法树的组成部分，它的作用是将表达式组合起来，引导解析器正确地处理它们。当建立好了抽象语法树后，它们便可以被抛弃了。

``````/// identifierexpr
///   ::= identifier
///   ::= identifier '(' expression* ')'
static ExprAST *ParseIdentifierExpr() {
std::string IdName = IdentifierStr;

getNextToken();  // eat identifier.

if (CurTok != '(') // Simple variable ref.
return new VariableExprAST(IdName);

// Call.
getNextToken();  // eat (
std::vector<ExprAST*> Args;
if (CurTok != ')') {
while (1) {
ExprAST *Arg = ParseExpression();
if (!Arg) return 0;
Args.push_back(Arg);

if (CurTok == ')') break;

if (CurTok != ',')
return Error("Expected ')' or ',' in argument list");
getNextToken();
}
}

// Eat the ')'.
getNextToken();

return new CallExprAST(IdName, Args);
}``````

``````/// primary
///   ::= identifierexpr
///   ::= numberexpr
///   ::= parenexpr
static ExprAST *ParsePrimary() {
switch (CurTok) {
default: return Error("unknown token when expecting an expression");
case tok_identifier: return ParseIdentifierExpr();
case tok_number:     return ParseNumberExpr();
case '(':            return ParseParenExpr();
}
}``````

## 二元表达式解析

``````/// BinopPrecedence - This holds the precedence for each binary operator that is
/// defined.
static std::map<char, int> BinopPrecedence;

/// GetTokPrecedence - Get the precedence of the pending binary operator token.
///
/// Returns the precedence registered in BinopPrecedence for CurTok, or -1 when
/// CurTok is not an ASCII character or has no positive precedence registered.
static int GetTokPrecedence() {
  if (!isascii(CurTok))
    return -1;

  // Make sure it's a declared binop.  find() is used rather than operator[]
  // so that probing a non-operator character does not insert a zero entry.
  std::map<char, int>::const_iterator It =
      BinopPrecedence.find(static_cast<char>(CurTok));
  if (It == BinopPrecedence.end() || It->second <= 0)
    return -1;
  return It->second;
}

int main() {
// Install standard binary operators.
// 1 is lowest precedence.
BinopPrecedence['<'] = 10;
BinopPrecedence['+'] = 20;
BinopPrecedence['-'] = 20;
BinopPrecedence['*'] = 40;  // highest.
...
}``````

``````/// expression
///   ::= primary binoprhs
///
static ExprAST *ParseExpression() {
ExprAST *LHS = ParsePrimary();
if (!LHS) return 0;

return ParseBinOpRHS(0, LHS);
}``````

`ParseBinOpRHS`是为我们解析*运算符-表达式*对的函数。它记录优先级和已解析部分的指针。

``````/// binoprhs
///   ::= ('+' primary)*
static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
// If this is a binop, find its precedence.
while (1) {
int TokPrec = GetTokPrecedence();

// If this is a binop that binds at least as tightly as the current binop,
// consume it, otherwise we are done.
if (TokPrec < ExprPrec)
return LHS;``````

``````// Okay, we know this is a binop.
int BinOp = CurTok;
getNextToken();  // eat binop

// Parse the primary expression after the binary operator.
ExprAST *RHS = ParsePrimary();
if (!RHS) return 0;``````

``````// If BinOp binds less tightly with RHS than the operator after RHS, let
// the pending operator take RHS as its LHS.
int NextPrec = GetTokPrecedence();
if (TokPrec < NextPrec) {``````

``````  ... if body omitted ...
}

// Merge LHS/RHS.
LHS = new BinaryExprAST(BinOp, LHS, RHS);
}  // loop around to the top of the while loop.
}``````

``````// If BinOp binds less tightly with RHS than the operator after RHS, let
// the pending operator take RHS as its LHS.
int NextPrec = GetTokPrecedence();
if (TokPrec < NextPrec) {
RHS = ParseBinOpRHS(TokPrec+1, RHS);
if (RHS == 0) return 0;
}
// Merge LHS/RHS.
LHS = new BinaryExprAST(BinOp, LHS, RHS);
}  // loop around to the top of the while loop.
}``````

## 其它解析

``````/// prototype
///   ::= id '(' id* ')'
static PrototypeAST *ParsePrototype() {
if (CurTok != tok_identifier)
return ErrorP("Expected function name in prototype");

std::string FnName = IdentifierStr;
getNextToken();

if (CurTok != '(')
return ErrorP("Expected '(' in prototype");

// Read the list of argument names.
std::vector<std::string> ArgNames;
while (getNextToken() == tok_identifier)
ArgNames.push_back(IdentifierStr);
if (CurTok != ')')
return ErrorP("Expected ')' in prototype");

// success.
getNextToken();  // eat ')'.

return new PrototypeAST(FnName, ArgNames);
}``````

``````/// definition ::= 'def' prototype expression
static FunctionAST *ParseDefinition() {
getNextToken();  // eat def.
PrototypeAST *Proto = ParsePrototype();
if (Proto == 0) return 0;

if (ExprAST *E = ParseExpression())
return new FunctionAST(Proto, E);
return 0;
}``````

``````/// external ::= 'extern' prototype
static PrototypeAST *ParseExtern() {
getNextToken();  // eat extern.
return ParsePrototype();
}``````

``````/// toplevelexpr ::= expression
static FunctionAST *ParseTopLevelExpr() {
if (ExprAST *E = ParseExpression()) {
// Make an anonymous proto.
PrototypeAST *Proto = new PrototypeAST("", std::vector<std::string>());
return new FunctionAST(Proto, E);
}
return 0;
}``````

## 驱动代码

``````/// top ::= definition | external | expression | ';'
///
/// Dispatches on the current token until end of input, printing the
/// "ready> " prompt to stderr before each top-level item is handled.
static void MainLoop() {
  while (1) {
    fprintf(stderr, "ready> ");
    switch (CurTok) {
    case tok_eof:    return;
    case ';':        getNextToken(); break;  // ignore top-level semicolons.
    case tok_def:    HandleDefinition(); break;
    case tok_extern: HandleExtern(); break;
    default:         HandleTopLevelExpression(); break;
    }
  }
}``````

```\$ ./a.out
ready> def foo(x y) x+foo(y, 4.0);
Parsed a function definition.
ready> def foo(x y) x+y y;
Parsed a function definition.
Parsed a top-level expr
ready> def foo(x y) x+y );
Parsed a function definition.
Error: unknown token when expecting an expression
\$```

## 完整代码

```# Compile
clang++ -g -O3 toy.cpp
# Run
./a.out```

``````#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>
#include <vector>

//===----------------------------------------------------------------------===//
// Lexer
//===----------------------------------------------------------------------===//

// The lexer returns tokens [0-255] if it is an unknown character, otherwise one
// of these for known things.
enum Token {
  tok_eof = -1,

  // commands
  tok_def = -2,
  tok_extern = -3,

  // primary
  tok_identifier = -4,
  tok_number = -5
};

static std::string IdentifierStr;  // Filled in if tok_identifier
static double NumVal;              // Filled in if tok_number

/// gettok - Return the next token from standard input.
///
/// Returns one of the negative Token values for end of file, the keywords
/// "def" and "extern", identifiers and numbers; any other character is
/// returned as its own character code.  For tok_identifier the text is left in
/// IdentifierStr, and for tok_number the value is left in NumVal.  Text from
/// '#' to the end of the line is skipped as a comment.
static int gettok() {
  // One character of lookahead, kept across calls.
  static int LastChar = ' ';

  // Skip any whitespace.
  while (isspace(LastChar))
    LastChar = getchar();

  if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9]*
    IdentifierStr = static_cast<char>(LastChar);
    while (isalnum((LastChar = getchar())))
      IdentifierStr += static_cast<char>(LastChar);

    // Keywords are lexed like any identifier and then mapped to their token.
    if (IdentifierStr == "def") return tok_def;
    if (IdentifierStr == "extern") return tok_extern;
    return tok_identifier;
  }

  if (isdigit(LastChar) || LastChar == '.') {   // Number: [0-9.]+
    std::string NumStr;
    do {
      NumStr += static_cast<char>(LastChar);
      LastChar = getchar();
    } while (isdigit(LastChar) || LastChar == '.');

    // strtod converts the longest valid prefix, so input such as "1.2.3" is
    // accepted by the loop above but yields 1.2.
    NumVal = strtod(NumStr.c_str(), 0);
    return tok_number;
  }

  if (LastChar == '#') {
    // Comment until end of line.
    do LastChar = getchar();
    while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');

    // Lex whatever follows the comment; at EOF fall through to the check below.
    if (LastChar != EOF)
      return gettok();
  }

  // Check for end of file.  Don't eat the EOF.
  if (LastChar == EOF)
    return tok_eof;

  // Otherwise, just return the character as its ascii value.
  int ThisChar = LastChar;
  LastChar = getchar();
  return ThisChar;
}

//===----------------------------------------------------------------------===//
// Abstract Syntax Tree (aka Parse Tree)
//===----------------------------------------------------------------------===//
namespace {
// None of the node classes below defines a destructor that frees its
// children; the raw pointers they hold are stored, not released.

/// ExprAST - Base class for all expression nodes.
class ExprAST {
public:
  virtual ~ExprAST() {}
};

/// NumberExprAST - Expression class for numeric literals like "1.0".
class NumberExprAST : public ExprAST {
  double Val;  // the literal's value
public:
  NumberExprAST(double val) : Val(val) {}
};

/// VariableExprAST - Expression class for referencing a variable, like "a".
class VariableExprAST : public ExprAST {
  std::string Name;
public:
  VariableExprAST(const std::string &name) : Name(name) {}
};

/// BinaryExprAST - Expression class for a binary operator.
class BinaryExprAST : public ExprAST {
  char Op;             // operator character, e.g. '+'
  ExprAST *LHS, *RHS;  // operand sub-expressions
public:
  BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs)
      : Op(op), LHS(lhs), RHS(rhs) {}
};

/// CallExprAST - Expression class for function calls.
class CallExprAST : public ExprAST {
  std::string Callee;
  std::vector<ExprAST*> Args;
public:
  /// The argument vector is copied, so it is taken by const reference; this
  /// also accepts every caller that passed a non-const lvalue before.
  CallExprAST(const std::string &callee, const std::vector<ExprAST*> &args)
      : Callee(callee), Args(args) {}
};

/// PrototypeAST - This class represents the "prototype" for a function,
/// which captures its name, and its argument names (thus implicitly the number
/// of arguments the function takes).
class PrototypeAST {
  std::string Name;
  std::vector<std::string> Args;
public:
  PrototypeAST(const std::string &name, const std::vector<std::string> &args)
      : Name(name), Args(args) {}
};

/// FunctionAST - This class represents a function definition itself.
class FunctionAST {
  PrototypeAST *Proto;  // name and parameter names
  ExprAST *Body;        // the expression forming the function body
public:
  FunctionAST(PrototypeAST *proto, ExprAST *body)
      : Proto(proto), Body(body) {}
};
} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Parser
//===----------------------------------------------------------------------===//

/// CurTok/getNextToken - A one-token buffer for the parser.  CurTok holds the
/// token currently being examined; getNextToken fetches the next token from
/// the lexer, stores it in CurTok and returns it.
static int CurTok;
static int getNextToken() {
  CurTok = gettok();
  return CurTok;
}

/// BinopPrecedence - This holds the precedence for each binary operator that is
/// defined.
static std::map<char, int> BinopPrecedence;

/// GetTokPrecedence - Get the precedence of the pending binary operator token.
///
/// Returns the precedence registered in BinopPrecedence for CurTok, or -1 when
/// CurTok is not an ASCII character or has no positive precedence registered.
static int GetTokPrecedence() {
  if (!isascii(CurTok))
    return -1;

  // Make sure it's a declared binop.  find() is used rather than operator[]
  // so that probing a non-operator character does not insert a zero entry.
  std::map<char, int>::const_iterator It =
      BinopPrecedence.find(static_cast<char>(CurTok));
  if (It == BinopPrecedence.end() || It->second <= 0)
    return -1;
  return It->second;
}

/// Error* - These are little helper functions for error handling.
ExprAST *Error(const char *Str) { fprintf(stderr, "Error: %s\n", Str);return 0;}
PrototypeAST *ErrorP(const char *Str) { Error(Str); return 0; }

static ExprAST *ParseExpression();

/// identifierexpr
///   ::= identifier
///   ::= identifier '(' expression* ')'
///
/// Produces a variable reference when the identifier is not followed by '(',
/// otherwise a call node with its comma-separated arguments.  Returns null
/// when an argument fails to parse or the argument list is malformed.
static ExprAST *ParseIdentifierExpr() {
  // Copy the name first: the next lexer call may overwrite IdentifierStr.
  const std::string Name = IdentifierStr;
  getNextToken();  // eat identifier.

  // Simple variable ref.
  if (CurTok != '(')
    return new VariableExprAST(Name);

  // Call.
  getNextToken();  // eat (
  std::vector<ExprAST*> CallArgs;
  if (CurTok != ')') {
    for (;;) {
      ExprAST *Parsed = ParseExpression();
      if (Parsed == 0)
        return 0;
      CallArgs.push_back(Parsed);

      if (CurTok == ')')
        break;
      if (CurTok != ',')
        return Error("Expected ')' or ',' in argument list");

      getNextToken();  // eat the ',' and go on to the next argument.
    }
  }

  // Eat the ')'.
  getNextToken();
  return new CallExprAST(Name, CallArgs);
}

/// numberexpr ::= number
///
/// Builds a literal node from NumVal, then advances past the number token.
static ExprAST *ParseNumberExpr() {
  // The node is created before advancing, while NumVal still holds the value
  // of the token being consumed.
  ExprAST *Literal = new NumberExprAST(NumVal);
  getNextToken(); // consume the number
  return Literal;
}

/// parenexpr ::= '(' expression ')'
///
/// Returns the inner expression itself; no node is created for the
/// parentheses.  Returns null if the inner expression fails to parse or the
/// closing ')' is missing.
static ExprAST *ParseParenExpr() {
  getNextToken();  // eat (.
  ExprAST *Inner = ParseExpression();
  if (Inner == 0)
    return 0;

  if (CurTok == ')') {
    getNextToken();  // eat ).
    return Inner;
  }
  return Error("expected ')'");
}

/// primary
///   ::= identifierexpr
///   ::= numberexpr
///   ::= parenexpr
///
/// Chooses the sub-parser from the current token; any other token is reported
/// as an error and yields null.
static ExprAST *ParsePrimary() {
  if (CurTok == tok_identifier)
    return ParseIdentifierExpr();
  if (CurTok == tok_number)
    return ParseNumberExpr();
  if (CurTok == '(')
    return ParseParenExpr();
  return Error("unknown token when expecting an expression");
}

/// binoprhs
///   ::= ('+' primary)*
///
/// Parses the sequence of [operator, primary] pairs that follows LHS.
/// ExprPrec is the minimal operator precedence this call may consume.
/// Returns the combined expression, or null if an operand fails to parse.
static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
  for (;;) {
    // Precedence of the pending token (-1 if it is not a binary operator).
    const int OpPrec = GetTokPrecedence();

    // Stop unless the pending operator binds at least as tightly as required.
    if (OpPrec < ExprPrec)
      return LHS;

    // Okay, we know this is a binop.
    const int Op = CurTok;
    getNextToken();  // eat binop

    // Parse the primary expression after the binary operator.
    ExprAST *Right = ParsePrimary();
    if (Right == 0)
      return 0;

    // If the operator after Right binds tighter than Op, let it take Right as
    // its left operand first.
    if (OpPrec < GetTokPrecedence()) {
      Right = ParseBinOpRHS(OpPrec + 1, Right);
      if (Right == 0)
        return 0;
    }

    // Merge LHS/RHS.
    LHS = new BinaryExprAST(Op, LHS, Right);
  }
}

/// expression
///   ::= primary binoprhs
///
/// Parses one primary and then any operator/operand pairs that follow it.
/// Returns null if the leading primary fails to parse.
static ExprAST *ParseExpression() {
  if (ExprAST *First = ParsePrimary())
    return ParseBinOpRHS(0, First);
  return 0;
}

/// prototype
///   ::= id '(' id* ')'
///
/// Parses a function name followed by a parenthesised list of parameter
/// names (no separators between them).  Returns null after reporting an
/// error if the name, '(' or ')' is missing.
static PrototypeAST *ParsePrototype() {
  if (CurTok != tok_identifier)
    return ErrorP("Expected function name in prototype");

  const std::string Name = IdentifierStr;
  if (getNextToken() != '(')
    return ErrorP("Expected '(' in prototype");

  // Read the list of argument names.
  std::vector<std::string> Params;
  for (getNextToken(); CurTok == tok_identifier; getNextToken())
    Params.push_back(IdentifierStr);

  if (CurTok != ')')
    return ErrorP("Expected ')' in prototype");

  // success.
  getNextToken();  // eat ')'.
  return new PrototypeAST(Name, Params);
}

/// definition ::= 'def' prototype expression
///
/// Returns the parsed function, or null if either the prototype or the body
/// expression fails to parse.
static FunctionAST *ParseDefinition() {
  getNextToken();  // eat def.

  PrototypeAST *Signature = ParsePrototype();
  if (!Signature)
    return 0;

  ExprAST *Body = ParseExpression();
  if (!Body)
    return 0;

  return new FunctionAST(Signature, Body);
}

/// toplevelexpr ::= expression
///
/// Wraps a bare expression in a function whose prototype has an empty name
/// and no parameters.  Returns null if the expression fails to parse.
static FunctionAST *ParseTopLevelExpr() {
  ExprAST *Body = ParseExpression();
  if (!Body)
    return 0;

  // Make an anonymous proto.
  PrototypeAST *Anon = new PrototypeAST("", std::vector<std::string>());
  return new FunctionAST(Anon, Body);
}

/// external ::= 'extern' prototype
///
/// Skips the 'extern' keyword and returns whatever ParsePrototype yields.
static PrototypeAST *ParseExtern() {
  getNextToken();  // eat extern.
  PrototypeAST *Decl = ParsePrototype();
  return Decl;
}

//===----------------------------------------------------------------------===//
// Top-Level parsing
//===----------------------------------------------------------------------===//

/// Parses one 'def' item and reports success on stderr; on failure a single
/// token is skipped before returning.
static void HandleDefinition() {
  if (!ParseDefinition()) {
    // Skip token for error recovery.
    getNextToken();
    return;
  }
  fprintf(stderr, "Parsed a function definition.\n");
}

/// Parses one 'extern' item and reports success on stderr; on failure a
/// single token is skipped before returning.
static void HandleExtern() {
  if (!ParseExtern()) {
    // Skip token for error recovery.
    getNextToken();
    return;
  }
  fprintf(stderr, "Parsed an extern\n");
}

/// Parses a top-level expression (wrapped in an anonymous function by
/// ParseTopLevelExpr) and reports success on stderr; on failure a single
/// token is skipped before returning.
static void HandleTopLevelExpression() {
  if (!ParseTopLevelExpr()) {
    // Skip token for error recovery.
    getNextToken();
    return;
  }
  fprintf(stderr, "Parsed a top-level expr\n");
}

/// top ::= definition | external | expression | ';'
///
/// Dispatches on the current token until end of input, printing the
/// "ready> " prompt to stderr before each top-level item is handled.
static void MainLoop() {
  while (1) {
    fprintf(stderr, "ready> ");
    switch (CurTok) {
    case tok_eof:    return;
    case ';':        getNextToken(); break;  // ignore top-level semicolons.
    case tok_def:    HandleDefinition(); break;
    case tok_extern: HandleExtern(); break;
    default:         HandleTopLevelExpression(); break;
    }
  }
}

//===----------------------------------------------------------------------===//
// Main driver code.
//===----------------------------------------------------------------------===//

/// Program entry point: registers the binary operators, reads the first
/// token and runs the top-level loop until end of input.
int main() {
  // Install standard binary operators.
  // 1 is lowest precedence.
  BinopPrecedence['<'] = 10;
  BinopPrecedence['+'] = 20;
  BinopPrecedence['-'] = 20;
  BinopPrecedence['*'] = 40;  // highest.

  // Prime the first token.  The prompt is written first because
  // getNextToken() reads from standard input.
  fprintf(stderr, "ready> ");
  getNextToken();

  // Run the main "interpreter loop" now.
  MainLoop();

  return 0;
}``````

## 中间码生成配置

``````/// ExprAST - Base class for all expression nodes.
class ExprAST {
public:
virtual ~ExprAST() {}
virtual Value *Codegen() = 0;
};

/// NumberExprAST - Expression class for numeric literals like "1.0".
class NumberExprAST : public ExprAST {
double Val;
public:
NumberExprAST(double val) : Val(val) {}
virtual Value *Codegen();
};``````

`Codegen()` 运行后会生成中间码以及其它运行时需要的信息，这些信息以 LLVM `Value` 对象的形式返回。`Value` 类用来表示LLVM中的“静态单赋值寄存器（Static Single Assignment register）”，即 SSA 值。SSA值的特点是：它的值由相关指令在执行时计算得出，在该指令重新执行之前不会获得新值。换句话说，SSA值一旦算出就无法被“修改”。想进一步了解SSA的话，请阅读“静态单赋值”的相关资料；一旦理解了它，你会发现这个概念相当简单。

``````Value *ErrorV(const char *Str) { Error(Str); return 0; }

static Module *TheModule;
static IRBuilder<> Builder(getGlobalContext());
static std::map<std::string, Value*> NamedValues;``````

`Builder` 是一个辅助对象，用来为生成LLVM指令提供方便。它是 `IRBuilder` 类的实例，用来标记当前位置以插入新的指令。

`NamedValues` 键值表保存了当前代码范围内定义的值，以及表示这些值的LLVM对象（换句话说，这就是当前代码的符号表）。在目前这个版本的 Kaleidoscope 中，唯一可以被引用的是函数参数（In this form of Kaleidoscope, the only things that can be referenced are function parameters. ）。因此，当生成函数体代码时，函数参数会被记录到这个表里去。

## 表达式代码生成

``````Value *NumberExprAST::Codegen() {
  // Wrap the stored literal in an APFloat and ask for the matching constant.
  const APFloat Literal(Val);
  return ConstantFP::get(getGlobalContext(), Literal);
}``````

``````Value *VariableExprAST::Codegen() {
  // Look this variable up in the function.  find() is used instead of
  // operator[] so that an unknown name does not add a null entry to
  // NamedValues.
  std::map<std::string, Value*>::iterator It = NamedValues.find(Name);
  if (It == NamedValues.end() || It->second == 0)
    return ErrorV("Unknown variable name");
  return It->second;
}``````

``````Value *BinaryExprAST::Codegen() {
  // Emit both operands first; give up if either one failed.
  Value *L = LHS->Codegen();
  Value *R = RHS->Codegen();
  if (L == 0 || R == 0) return 0;

  switch (Op) {
  case '+': return Builder.CreateFAdd(L, R, "addtmp");
  case '-': return Builder.CreateFSub(L, R, "subtmp");
  case '*': return Builder.CreateFMul(L, R, "multmp");
  case '<':
    L = Builder.CreateFCmpULT(L, R, "cmptmp");
    // Convert bool 0/1 to double 0.0 or 1.0
    return Builder.CreateUIToFP(L, Type::getDoubleTy(getGlobalContext()),
                                "booltmp");
  default: return ErrorV("invalid binary operator");
  }
}``````

JSmiles

2583 文章
29 评论
84935 人气