Code Generation: Transforming ASTs into Target Code
Code generation is the final phase of the compilation process, where the compiler translates the optimized intermediate representation (typically an AST) into the target language, which could be assembly code, machine code, or another programming language.
What is Code Generation?
Code generation refers to the process of converting the internal representation of the program (usually an AST) into a form that can be executed by a computer. This involves mapping high-level language constructs to low-level instructions or code.
Target Languages
Code generation can target various outputs:
Code Generation Phases
The code generation process typically involves:
- Instruction selection
- Register allocation
- Instruction scheduling
- Code emission
Implementing a Simple Code Generator in JavaScript
Let's build a code generator that translates our AST into JavaScript code:
class CodeGenerator {
constructor() {
this.code = '';
this.indentLevel = 0;
}
indent() {
return ' '.repeat(this.indentLevel * 2);
}
visitProgram(node) {
for (const statement of node.body) {
this.code += this.indent();
this.visit(statement);
if (!this.code.endsWith('\n')) {
this.code += '\n';
}
}
return this.code;
}
visitBinaryExpression(node) {
let code = '(';
code += this.visit(node.left);
code += ` ${node.operator} `;
code += this.visit(node.right);
code += ')';
return code;
}
visitVariableDeclaration(node) {
let code = `${node.kind} ${node.name}`;
if (node.initializer) {
code += ` = ${this.visit(node.initializer)}`;
}
code += ';';
return code;
}
visitExpressionStatement(node) {
return `${this.visit(node.expression)};`;
}
visitIdentifier(node) {
return node.name;
}
visitLiteral(node) {
if (typeof node.value === 'string') {
return `"${node.value}"`;
}
return String(node.value);
}
visitBlockStatement(node) {
let code = '{\n';
this.indentLevel++;
for (const statement of node.body) {
code += this.indent();
code += this.visit(statement);
if (!code.endsWith('\n')) {
code += '\n';
}
}
this.indentLevel--;
code += this.indent() + '}';
return code;
}
visitGroupExpression(node) {
return `(${this.visit(node.expression)})`;
}
visit(node) {
switch (node.type) {
case 'Program': return this.visitProgram(node);
case 'BinaryExpression': return this.visitBinaryExpression(node);
case 'VariableDeclaration': return this.visitVariableDeclaration(node);
case 'ExpressionStatement': return this.visitExpressionStatement(node);
case 'Identifier': return this.visitIdentifier(node);
case 'Literal': return this.visitLiteral(node);
case 'BlockStatement': return this.visitBlockStatement(node);
case 'GroupExpression': return this.visitGroupExpression(node);
default:
throw new Error(`Unknown node type: ${node.type}`);
}
}
generate(ast) {
return this.visit(ast);
}
}
// Example usage
const sourceCode = `
let x = 10;
let y = (x + 5) * 2;
`;
// Parse and generate code
const lexer = new Lexer(sourceCode); // Assuming Lexer is defined
const tokens = lexer.tokenize();
const parser = new Parser(tokens); // Assuming Parser is defined
const ast = parser.parse();
const generator = new CodeGenerator();
const generatedCode = generator.generate(ast);
console.log('Generated code:');
console.log(generatedCode);
Generating Low-Level Code: Assembly Example
To generate assembly code, we'd modify our visitor to output assembly instructions. Here's a simplified example targeting a fictional assembly language:
class AssemblyGenerator {
constructor() {
this.assembly = [];
this.tempVarCounter = 0;
this.variables = new Map(); // Maps variable names to memory locations
}
generateTempVar() {
return `t${this.tempVarCounter++}`;
}
visitProgram(node) {
this.assembly.push('.data');
// Allocate memory for variables
for (const statement of node.body) {
if (statement.type === 'VariableDeclaration') {
this.variables.set(statement.name, statement.name);
this.assembly.push(`${statement.name}: .word 0`);
}
}
this.assembly.push('.text');
this.assembly.push('main:');
// Generate code for statements
for (const statement of node.body) {
this.visit(statement);
}
// Exit program
this.assembly.push(' li $v0, 10');
this.assembly.push(' syscall');
return this.assembly.join('\n');
}
visitBinaryExpression(node) {
const left = this.visit(node.left);
const right = this.visit(node.right);
const result = this.generateTempVar();
switch (node.operator) {
case '+':
this.assembly.push(` add ${result}, ${left}, ${right}`);
break;
case '-':
this.assembly.push(` sub ${result}, ${left}, ${right}`);
break;
case '*':
this.assembly.push(` mul ${result}, ${left}, ${right}`);
break;
case '/':
this.assembly.push(` div ${result}, ${left}, ${right}`);
break;
}
return result;
}
visitVariableDeclaration(node) {
if (node.initializer) {
const value = this.visit(node.initializer);
const varName = this.variables.get(node.name);
this.assembly.push(` sw ${value}, ${varName}`);
}
}
visitIdentifier(node) {
const tempVar = this.generateTempVar();
const varName = this.variables.get(node.name);
this.assembly.push(` lw ${tempVar}, ${varName}`);
return tempVar;
}
visitLiteral(node) {
const tempVar = this.generateTempVar();
this.assembly.push(` li ${tempVar}, ${node.value}`);
return tempVar;
}
visit(node) {
switch (node.type) {
case 'Program': return this.visitProgram(node);
case 'BinaryExpression': return this.visitBinaryExpression(node);
case 'VariableDeclaration': return this.visitVariableDeclaration(node);
case 'Identifier': return this.visitIdentifier(node);
case 'Literal': return this.visitLiteral(node);
// Handle other node types...
default:
throw new Error(`Unknown node type: ${node.type}`);
}
}
generate(ast) {
return this.visit(ast);
}
}
Target Code Optimization
After generating code, further optimizations can be applied:
Peephole Optimization Example
Peephole optimization looks at small sequences of target code instructions (the "peephole") and replaces them with more efficient sequences:
function peepholeOptimize(assembly) {
const lines = assembly.split('\n');
const optimized = [];
for (let i = 0; i < lines.length; i++) {
// Skip redundant load/store pairs
if (i < lines.length - 1 &&
lines[i].match(/\s+sw\s+(\$\w+),\s+(\w+)/) &&
lines[i+1].match(/\s+lw\s+(\$\w+),\s+(\w+)/)) {
const storeParts = lines[i].match(/\s+sw\s+(\$\w+),\s+(\w+)/);
const loadParts = lines[i+1].match(/\s+lw\s+(\$\w+),\s+(\w+)/);
if (storeParts[2] === loadParts[2]) {
// If loading from the same memory location just stored to,
// replace with a move instruction
optimized.push(` move ${loadParts[1]}, ${storeParts[1]}`);
i++; // Skip the next line
continue;
}
}
// Replace multiply by 2 with shift left by 1
const mulBy2 = lines[i].match(/\s+mul\s+(\$\w+),\s+(\$\w+),\s+2/);
if (mulBy2) {
optimized.push(` sll ${mulBy2[1]}, ${mulBy2[2]}, 1`);
continue;
}
// Add the line unchanged if no optimization applied
optimized.push(lines[i]);
}
return optimized.join('\n');
}
Code Generation Process Visualization
Generating Code for Different Target Architectures
Just-In-Time (JIT) Compilation
JIT compilers generate native code at runtime rather than ahead of time:
class JITCompiler {
compile(sourceCode) {
const lexer = new Lexer(sourceCode);
const tokens = lexer.tokenize();
const parser = new Parser(tokens);
const ast = parser.parse();
// First generate bytecode (intermediate representation)
const bytecodeGenerator = new BytecodeGenerator();
const bytecode = bytecodeGenerator.generate(ast);
// Then compile hot parts to native code at runtime
return this.executeWithProfiling(bytecode);
}
executeWithProfiling(bytecode) {
// Execute bytecode while collecting profiling data
const profiler = new Profiler();
const interpreter = new Interpreter(bytecode, profiler);
// During execution, compile hot functions to native code
interpreter.onHotspot = (function_, profile) => {
const nativeCode = this.generateNativeCode(function_, profile);
interpreter.replaceWithNative(function_, nativeCode);
};
return interpreter.execute();
}
generateNativeCode(bytecodeFunction, profile) {
// Use profiling information for optimized native code generation
// This would use platform-specific code generation APIs
const specializer = new CodeSpecializer(profile);
return specializer.specialize(bytecodeFunction);
}
}
Conclusion
Code generation is the final stage in the compilation process, transforming the AST or intermediate representation into executable code. Whether targeting assembly code, machine code, or another high-level language, the principles remain the same: map the program's semantics to the target platform's execution model.
Throughout this compiler design guide, we've explored the complete pipeline from lexical analysis to code generation. Building a compiler requires understanding each of these phases and how they interact to transform source code into executable code.
The field of compiler design continues to evolve with new optimization techniques, target architectures, and programming language features. The fundamentals covered in this guide provide a solid foundation for exploring these advanced topics.