前言
JavaScript源码经过词法分析和语法分析之后,就产生了语法分析树;接下来的工作是将语法分析树转换为字节码,这也是本文将要讲述的内容。上述整个过程实际上就是任何一门程序语言都必经的步骤——编译。但是不同于许多语言(C/C++/Objective-C),JavaScript编译结束之后,并不会生成存放在硬盘之中的目标代码或可执行文件;生成的字节码指令可能会立即被虚拟机逐条解释执行,也有可能被缓存下来,通过JIT技术转换为本地代码。
CodeCache
在开篇中,我们提到如下的图:
当语法分析结束后,就该CodeCache
大显身手了,从名字就可以推断出,它负责CodeBlock的缓存处理
// Cache of unlinked code keyed by source: the lookups below return a previously
// stored entry when one exists and otherwise produce and store a new one.
class CodeCache {
public:
CodeCache();
~CodeCache();
// Lookups for program, eval and module-program code. Each returns the matching
// Unlinked*CodeBlock type and reports problems through the ParserError out-parameter.
UnlinkedProgramCodeBlock* getProgramCodeBlock(VM&, ProgramExecutable*, const SourceCode&, JSParserBuiltinMode, JSParserStrictMode, DebuggerMode, ParserError&);
UnlinkedEvalCodeBlock* getEvalCodeBlock(VM&, EvalExecutable*, const SourceCode&, JSParserBuiltinMode, JSParserStrictMode, DebuggerMode, ParserError&, EvalContextType, const VariableEnvironment*);
UnlinkedModuleProgramCodeBlock* getModuleProgramCodeBlock(VM&, ModuleProgramExecutable*, const SourceCode&, JSParserBuiltinMode, DebuggerMode, ParserError&);
// Unlike the lookups above, this one returns an UnlinkedFunctionExecutable rather than a code block.
UnlinkedFunctionExecutable* getFunctionExecutableFromGlobalCode(VM&, const Identifier&, const SourceCode&, ParserError&);
private:
// Generic lookup-or-generate routine, parameterized on the code block and executable types.
template <class UnlinkedCodeBlockType, class ExecutableType>
UnlinkedCodeBlockType* getGlobalCodeBlock(VM&, ExecutableType*, const SourceCode&, JSParserBuiltinMode, JSParserStrictMode, DebuggerMode, ParserError&, EvalContextType, const VariableEnvironment*);
// Backing container for the cached entries.
CodeCacheMap m_sourceCode;
};
m_sourceCode
是容器,用来存储CodeBlock
CodeCache
对外有4个getXXX方法,用来获取各种CodeBlock(第4个返回的是函数的Executable)
,其中前3个都会调到getGlobalCodeBlock
方法:
// Returns the unlinked code block for `source`: the cached one when the key is found
// (and caching is allowed), otherwise a newly created block filled in by BytecodeGenerator
// and then added to the cache. Parts of the original function are elided with `...`.
template <class UnlinkedCodeBlockType, class ExecutableType>
UnlinkedCodeBlockType* CodeCache::getGlobalCodeBlock(VM& vm, ExecutableType* executable, const SourceCode& source, JSParserBuiltinMode builtinMode, JSParserStrictMode strictMode, DebuggerMode debuggerMode, ParserError& error, EvalContextType evalContextType, const VariableEnvironment* variablesUnderTDZ)
{
...
// Build the cache key from the source plus the code type and the parse/context modes.
SourceCodeKey key(source, String(), CacheTypes<UnlinkedCodeBlockType>::codeType, builtinMode, strictMode, derivedContextType, evalContextType, isArrowFunctionContext);
// Look the key up in the cache (judging by the method name, this also refreshes the entry's age).
SourceCodeValue* cache = m_sourceCode.findCacheAndUpdateAge(key);
// On a hit, and only when caching is allowed, return the cached UnlinkedCodeBlock directly.
if (cache && canCache) {
UnlinkedCodeBlockType* unlinkedCodeBlock = jsCast<UnlinkedCodeBlockType*>(cache->cell.get());
...
return unlinkedCodeBlock;
}
...
// Otherwise create a fresh UnlinkedCodeBlock to hold the bytecode produced by `BytecodeGenerator`.
UnlinkedCodeBlockType* unlinkedCodeBlock = UnlinkedCodeBlockType::create(&vm, executable->executableInfo(), debuggerMode);
unlinkedCodeBlock->recordParse(rootNode->features(), rootNode->hasCapturedVariables(), rootNode->firstLine() - source.firstLine(), lineCount, unlinkedEndColumn);
unlinkedCodeBlock->setSourceURLDirective(source.provider()->sourceURL());
unlinkedCodeBlock->setSourceMappingURLDirective(source.provider()->sourceMappingURL());
// Run `BytecodeGenerator` over the root node; the bytecode ends up in unlinkedCodeBlock.
// NOTE(review): in this excerpt `error` is not checked before the block is cached and
// returned — presumably that check was elided; confirm against the full source.
error = BytecodeGenerator::generate(vm, rootNode.get(), unlinkedCodeBlock, debuggerMode, variablesUnderTDZ);
// Add the new block to the cache for later lookups.
m_sourceCode.addCache(key, SourceCodeValue(vm, unlinkedCodeBlock, m_sourceCode.age()));
// Return the UnlinkedCodeBlock.
return unlinkedCodeBlock;
}
getFunctionExecutableFromGlobalCode
比较特殊:
// Returns the UnlinkedFunctionExecutable for `source`, which is expected to contain a single
// function declaration. A cached executable is reused when the key matches; otherwise the
// source is parsed, an executable is built from the declaration's metadata (renamed to
// `name`) and stored in the cache. Returns nullptr when parsing fails or when the parsed
// program does not have the expected shape.
UnlinkedFunctionExecutable* CodeCache::getFunctionExecutableFromGlobalCode(VM& vm, const Identifier& name, const SourceCode& source, ParserError& error)
{
    const bool isArrowFunctionContext = false;

    // Cache key: the source and name plus fixed non-builtin, non-strict, no-context modes.
    SourceCodeKey cacheKey(source, name.string(), SourceCodeType::FunctionType, JSParserBuiltinMode::NotBuiltin, JSParserStrictMode::NotStrict, DerivedContextType::None, EvalContextType::None, isArrowFunctionContext);

    // Cache hit: copy the executable's URL directives onto the provider and reuse it.
    if (SourceCodeValue* cached = m_sourceCode.findCacheAndUpdateAge(cacheKey)) {
        UnlinkedFunctionExecutable* cachedExecutable = jsCast<UnlinkedFunctionExecutable*>(cached->cell.get());
        source.provider()->setSourceURLDirective(cachedExecutable->sourceURLDirective());
        source.provider()->setSourceMappingURLDirective(cachedExecutable->sourceMappingURLDirective());
        return cachedExecutable;
    }

    // Cache miss: parse the source as a program.
    JSTextPosition positionBeforeLastNewline;
    std::unique_ptr<ProgramNode> programNode = parse<ProgramNode>(&vm, source, Identifier(), JSParserBuiltinMode::NotBuiltin, JSParserStrictMode::NotStrict, SourceParseMode::ProgramMode, SuperBinding::NotNeeded, error, &positionBeforeLastNewline);
    if (!programNode)
        return nullptr;

    // This function assumes an input string that would result in a single function declaration:
    // a program whose only statement is a block whose only statement is that declaration.
    StatementNode* outerStatement = programNode->singleStatement();
    if (!(outerStatement && outerStatement->isBlock()))
        return nullptr;

    StatementNode* declaration = static_cast<BlockNode*>(outerStatement)->singleStatement();
    if (!(declaration && declaration->isFuncDeclNode()))
        return nullptr;

    FunctionMetadataNode* functionMetadata = static_cast<FuncDeclNode*>(declaration)->metadata();
    if (!functionMetadata)
        return nullptr;

    functionMetadata->overrideName(name);
    functionMetadata->setEndPosition(positionBeforeLastNewline);

    // The Function constructor only has access to global variables, so no variables will be under TDZ.
    VariableEnvironment emptyTDZVariables;

    // Build the executable and record the provider's URL directives on it.
    UnlinkedFunctionExecutable* newExecutable = UnlinkedFunctionExecutable::create(&vm, source, functionMetadata, UnlinkedNormalFunction, ConstructAbility::CanConstruct, emptyTDZVariables, DerivedContextType::None);
    newExecutable->setSourceURLDirective(source.provider()->sourceURL());
    newExecutable->setSourceMappingURLDirective(source.provider()->sourceMappingURL());

    // Remember it for subsequent lookups, then hand it back.
    m_sourceCode.addCache(cacheKey, SourceCodeValue(vm, newExecutable, m_sourceCode.age()));
    return newExecutable;
}
它并没有返回CodeBlock
,而是返回一个UnlinkedFunctionExecutable
,然而在UnlinkedFunctionExecutable
内部的方法generateUnlinkedFunctionCodeBlock
实际上也调用了BytecodeGenerator::generate
:
// Parses the function's source and generates its bytecode into a new UnlinkedFunctionCodeBlock.
// Returns nullptr when parsing fails or when bytecode generation reports a valid error;
// `error` carries the result of the parse / generation step.
static UnlinkedFunctionCodeBlock* generateUnlinkedFunctionCodeBlock(
    VM& vm, UnlinkedFunctionExecutable* executable, const SourceCode& source,
    CodeSpecializationKind kind, DebuggerMode debuggerMode,
    UnlinkedFunctionKind functionKind, ParserError& error, SourceParseMode parseMode)
{
    // Derive the parser modes from the executable.
    JSParserBuiltinMode builtinMode = JSParserBuiltinMode::NotBuiltin;
    if (executable->isBuiltinFunction())
        builtinMode = JSParserBuiltinMode::Builtin;

    JSParserStrictMode strictMode = JSParserStrictMode::NotStrict;
    if (executable->isInStrictContext())
        strictMode = JSParserStrictMode::Strict;

    std::unique_ptr<FunctionNode> functionNode = parse<FunctionNode>(
        &vm, source, executable->name(), builtinMode, strictMode,
        executable->parseMode(), executable->superBinding(), error, nullptr);
    if (!functionNode)
        return nullptr;

    functionNode->finishParsing(executable->name(), executable->functionMode());
    executable->recordParse(functionNode->features(), functionNode->hasCapturedVariables());

    const bool isClassContext = executable->superBinding() == SuperBinding::Needed;
    const bool isConstructor = kind == CodeForConstruct;
    const bool isBuiltinFunction = functionKind == UnlinkedBuiltinFunction;

    // Describe the function, then create the code block that will receive the bytecode.
    ExecutableInfo info(
        functionNode->usesEval(), functionNode->isStrictMode(), isConstructor, isBuiltinFunction,
        executable->constructorKind(), executable->superBinding(), parseMode,
        executable->derivedContextType(), false, isClassContext,
        EvalContextType::FunctionEvalContext);
    UnlinkedFunctionCodeBlock* codeBlock = UnlinkedFunctionCodeBlock::create(&vm, FunctionCode, info, debuggerMode);

    // Let `BytecodeGenerator` emit the bytecode into the code block.
    error = BytecodeGenerator::generate(vm, functionNode.get(), codeBlock, debuggerMode, executable->parentScopeTDZVariables());
    return error.isValid() ? nullptr : codeBlock;
}
最终下来,BytecodeGenerator
的探索显得格外重要了
BytecodeGenerator
BytecodeGenerator
名如其意:字节码生成器。它对外有一个类方法generate
,里面调用了它的构造函数生成一个临时的BytecodeGenerator
变量,并调用其不带参数的generate
方法,其返回值ParserError表示生成过程中是否出错,而真正的字节码结果存放在第3个参数CodeBlock
中
// Convenience entry point: constructs a temporary BytecodeGenerator from `vm` and the
// forwarded arguments, runs its no-argument generate(), and returns the resulting ParserError.
// The generator is destroyed once the return value has been produced.
template<typename... Args>
static ParserError generate(VM& vm, Args&& ...args)
{
    return std::make_unique<BytecodeGenerator>(vm, std::forward<Args>(args)...)->generate();
}
接下来看下它的几个私有构造函数:
// Excerpt of the BytecodeGenerator class (members elided with `...`).
class BytecodeGenerator
{
// One constructor per kind of root node, each paired with the matching unlinked code block
// type, plus a DebuggerMode and a VariableEnvironment pointer.
BytecodeGenerator(VM&, ProgramNode*, UnlinkedProgramCodeBlock*, DebuggerMode, const VariableEnvironment*);
BytecodeGenerator(VM&, FunctionNode*, UnlinkedFunctionCodeBlock*, DebuggerMode, const VariableEnvironment*);
BytecodeGenerator(VM&, EvalNode*, UnlinkedEvalCodeBlock*, DebuggerMode, const VariableEnvironment*);
BytecodeGenerator(VM&, ModuleProgramNode*, UnlinkedModuleProgramCodeBlock*, DebuggerMode, const VariableEnvironment*);
~BytecodeGenerator();
...
private:
// The unlinked code block this generator works on, held through a Strong handle.
Strong<UnlinkedCodeBlock> m_codeBlock;
}
可以看到,BytecodeGenerator
需要注入以下几个参数,其中除了DebuggerMode外,任何一个参数,都可能是一个篇幅讲述不完的,所以尽量讲的简要一点。
- VM 虚拟机
- Node,有
ProgramNode
、FunctionNode
、EvalNode
和ModuleProgramNode
4种
- UnlinkedCodeBlock
,和Node相对应,有UnlinkedProgramCodeBlock
、UnlinkedFunctionCodeBlock
、UnlinkedEvalCodeBlock
和UnlinkedModuleProgramCodeBlock
4种
- DebuggerMode 是否是调试模式
- VariableEnvironment 变量环境
参数分析
VM
如枚举VMType
所示,VM有3种类型:
enum VMType { Default, APIContextGroup, APIShared };
// WebCore has a one-to-one mapping of threads to VMs;
// either create() or createLeaked() should only be called once
// on a thread, this is the 'default' VM .
// API contexts created using the new context group aware interface
// create APIContextGroup objects which require less locking of JSC
// than the old singleton APIShared VM created for use by
// the original API.
VM的定义如下:
其中构造函数是私有的,外部不可访问,另外还罗列了一些我认为重要的成员变量
class VM : public ThreadSafeRefCounted<VM> {
public:
// VM的APIShared创建方式:单例
bool isSharedInstance() { return vmType == APIShared; }
bool usingAPI() { return vmType != Default; }
JS_EXPORT_PRIVATE static bool sharedInstanceExists();
JS_EXPORT_PRIVATE static VM& sharedInstance();
// VM的Default创建方式
JS_EXPORT_PRIVATE static Ref<VM> create(HeapType = SmallHeap);
JS_EXPORT_PRIVATE static Ref<VM> createLeaked(HeapType = SmallHeap);
// VM的APIContextGroup创建方式
static Ref<VM> createContextGroup(HeapType = SmallHeap);
...
private:
// 锁
RefPtr<JSLock> m_apiLock;
public:
ExecutableAllocator executableAllocator;
Heap heap;
VMType vmType;
VMEntryFrame* topVMEntryFrame;
ExecState* topCallFrame;
// a large number of Structures
Strong<Structure> structureStructure;
Strong<Structure> structureRareDataStructure;
Strong<Structure> terminatedExecutionErrorStructure;
Strong<Structure> stringStructure;
Strong<Structure> propertyNameIteratorStructure;
Strong<Structure> propertyNameEnumeratorStructure;
Strong<Structure> customGetterSetterStructure;
Strong<Structure> scopedArgumentsTableStructure;
Strong<Structure> apiWrapperStructure;
Strong<Structure> JSScopeStructure;
Strong<Structure> executableStructure;
Strong<Structure> nativeExecutableStructure;
Strong<Structure> evalExecutableStructure;
Strong<Structure> programExecutableStructure;
Strong<Structure> functionExecutableStructure;
...
PrototypeMap prototypeMap;
SourceProviderCacheMap sourceProviderCacheMap;
Interpreter* interpreter;
...
ExecState* newCallFrameReturnValue;
ExecState* callFrameForCatch;
...
ExecState
是执行脚本的对象,由GlobalObject
管理的,负责记录脚本执行上下文
Node
在语法分析篇中,我们提到了Node
的继承图,其中本文提到的4种Node
均继承自ScopeNode
,而ScopeNode
继承自StatementNode
,显然这4种Node
属于基本语句,并且和区域范围有关。
UnlinkedCodeBlock
UnlinkedCodeBlock
是代码管理类,主要有UnlinkedProgramCodeBlock
、UnlinkedFunctionCodeBlock
、UnlinkedEvalCodeBlock
和UnlinkedModuleProgramCodeBlock
4种。它存储的是 编译后的ByteCode
,它的继承层级如下图所示:
与之对应的还有一个CodeBlock
,主要用于LLint
和JIT
,它的继承层级和UnlinkedCodeBlock
极其类似,我们后续的系列篇章中会提到。
VariableEnvironment
VariableEnvironment
有一个非常重要的成员变量m_map
,它用来存放各种被标记的变量:
Map m_map;
它的key为UniquedStringImpl*
类型,value为枚举类型Traits
// Bit flags describing a variable; each enumerator occupies its own bit so that
// several traits can be combined in one 16-bit value.
enum Traits : uint16_t
{
    IsCaptured                    = 0x0001, // bit 0
    IsConst                       = 0x0002, // bit 1
    IsVar                         = 0x0004, // bit 2
    IsLet                         = 0x0008, // bit 3
    IsExported                    = 0x0010, // bit 4
    IsImported                    = 0x0020, // bit 5
    IsImportedNamespace           = 0x0040, // bit 6
    IsFunction                    = 0x0080, // bit 7
    IsParameter                   = 0x0100, // bit 8
    IsSloppyModeHoistingCandidate = 0x0200  // bit 9
};
同时,它还有若干重要的方法,用来捕获和标记变量
bool hasCapturedVariables() const;
bool captures(UniquedStringImpl* identifier) const;
void markVariableAsCapturedIfDefined(const RefPtr<UniquedStringImpl>& identifier);
void markVariableAsCaptured(const RefPtr<UniquedStringImpl>& identifier);
void markAllVariablesAsCaptured();
void markVariableAsImported(const RefPtr<UniquedStringImpl>& identifier);
void markVariableAsExported(const RefPtr<UniquedStringImpl>& identifier);
从这些方法名和枚举值可以简单地推测出:它主要用来标识变量的上下文,例如是否是闭包中捕获的变量,是否是导入的变量等
generate
接下来就该分析核心函数generate
了:
// Drives bytecode generation for the scope node and stores the resulting instruction stream
// in m_codeBlock. Returns OutOfMemory when m_expressionTooDeep is set, ErrorNone otherwise.
// Parts of the original function are elided with `...`.
ParserError BytecodeGenerator::generate()
{
// Record the `this` register in the code block.
m_codeBlock->setThisRegister(m_thisRegister.virtualRegister());
// Initialize the `arguments` variable when required, and emit the rest parameter if there is one.
if (m_needToInitializeArguments)
initializeVariable(variable(propertyNames().arguments), m_argumentsRegister);
if (m_restParameter)
m_restParameter->emit(*this);
...
// callingClassConstructor is true when constructorKind() is not None but isConstructor() is false.
// Only when it is false does the scope node emit its bytecode via emitBytecode (the key step).
bool callingClassConstructor = constructorKind() != ConstructorKind::None && !isConstructor();
if (!callingClassConstructor)
m_scopeNode->emitBytecode(*this);
....
// Hand the generated instructions to m_codeBlock, then shrink it to fit.
m_codeBlock->setInstructions(std::make_unique<UnlinkedInstructionStream>(m_instructions));
m_codeBlock->shrinkToFit();
// Error reporting: an over-deep expression is reported as OutOfMemory.
if (m_expressionTooDeep)
return ParserError(ParserError::OutOfMemory);
return ParserError(ParserError::ErrorNone);
}
emitBytecode
前文提到的4种Node均继承自ScopeNode
,所以这里展开多态性分析:
ProgramNode
// Bytecode entry point for a ProgramNode: forwards to emitProgramNodeBytecode with this node
// as the scope node. The RegisterID* parameter is unnamed and unused.
void ProgramNode::emitBytecode(BytecodeGenerator& generator, RegisterID*)
{
emitProgramNodeBytecode(generator, *this);
}
// Emits the bytecode for a program-like scope node: a temporary is loaded with undefined,
// a control-flow profiling point is emitted at the node's start offset, the node's
// statements are emitted with that temporary as destination, and finally an end
// instruction is emitted for the same temporary.
static void emitProgramNodeBytecode(BytecodeGenerator& generator, ScopeNode& scopeNode)
{
    RefPtr<RegisterID> resultHolder = generator.newTemporary();
    RegisterID* const result = resultHolder.get();

    generator.emitLoad(result, jsUndefined());
    generator.emitProfileControlFlow(scopeNode.startStartOffset());
    scopeNode.emitStatementsBytecode(generator, result);
    generator.emitEnd(result);
}
调用堆栈如下:
ProgramNode::emitBytecode
└──emitProgramNodeBytecode
└──emitStatementsBytecode
ModuleProgramNode
// Bytecode entry point for a ModuleProgramNode: forwards to emitProgramNodeBytecode with this
// node as the scope node. The RegisterID* parameter is unnamed and unused.
void ModuleProgramNode::emitBytecode(BytecodeGenerator& generator, RegisterID*)
{
emitProgramNodeBytecode(generator, *this);
}
可以看到ModuleProgramNode
和ProgramNode
两者的emitBytecode
调用堆栈是一致的,这里不再重复分析。
调用堆栈如下:
ModuleProgramNode::emitBytecode
└──emitProgramNodeBytecode
└──emitStatementsBytecode
EvalNode
// Bytecode entry point for an EvalNode: a temporary is loaded with undefined, the node's
// statements are emitted with that temporary as destination, and an end instruction is
// emitted for it. The RegisterID* parameter is unnamed and unused.
void EvalNode::emitBytecode(BytecodeGenerator& generator, RegisterID*)
{
    RefPtr<RegisterID> resultHolder = generator.newTemporary();
    RegisterID* const result = resultHolder.get();

    generator.emitLoad(result, jsUndefined());
    emitStatementsBytecode(generator, result);
    generator.emitEnd(result);
}
调用堆栈如下:
EvalNode::emitBytecode
└──emitStatementsBytecode
FunctionNode
FunctionNode
的字节码生成比较麻烦,它区分了几种解析模式:
// Parse modes understood by the parser and the bytecode generator. The numeric values
// are spelled out explicitly; they match the implicit numbering of the declaration order.
enum class SourceParseMode : uint8_t {
    NormalFunctionMode           = 0,
    GeneratorBodyMode            = 1,
    GeneratorWrapperFunctionMode = 2,
    GetterMode                   = 3,
    SetterMode                   = 4,
    MethodMode                   = 5,
    ArrowFunctionMode            = 6,
    ProgramMode                  = 7,
    ModuleAnalyzeMode            = 8,
    ModuleEvaluateMode           = 9
};
其中前7种都算是Function的范畴,这里针对GeneratorWrapperFunctionMode
和GeneratorBodyMode
做了单独解析处理,其它的走默认解析方式
// Emits the bytecode for a function body. The strategy depends on generator.parseMode():
// the generator wrapper and generator body modes get dedicated handling, every other mode
// takes the default path. The RegisterID* parameter is unnamed and unused. The beginning of
// the original function is elided with `...`.
void FunctionNode::emitBytecode(BytecodeGenerator& generator, RegisterID*)
{
...
switch (generator.parseMode())
{
case SourceParseMode::GeneratorWrapperFunctionMode:
{
// The wrapper's single statement is treated as an expression statement whose expression is a
// function expression; that function is emitted into the temporary `next`.
StatementNode* singleStatement = this->singleStatement();
ExprStatementNode* exprStatement = static_cast<ExprStatementNode*>(singleStatement);
ExpressionNode* expr = exprStatement->expr();
FuncExprNode* funcExpr = static_cast<FuncExprNode*>(expr);
RefPtr<RegisterID> next = generator.newTemporary();
generator.emitNode(next.get(), funcExpr);
// When a super binding is needed, attach the callee's home object to `next`.
if (generator.superBinding() == SuperBinding::Needed) {
RefPtr<RegisterID> homeObject = emitHomeObjectForCallee(generator);
emitPutHomeObject(generator, next.get(), homeObject.get());
}
// FIXME: Currently, we just create an object and store generator related fields as its properties for ease.
// But to make it efficient, we will introduce JSGenerator class, add opcode new_generator and use its C++ fields instead of these private properties.
// https://bugs.webkit.org/show_bug.cgi?id=151545
// Store next, this, the initial state (0) and the frame (null) as private properties on the
// generator register, then return the generator register.
generator.emitDirectPutById(generator.generatorRegister(), generator.propertyNames().generatorNextPrivateName, next.get(), PropertyNode::KnownDirect);
generator.emitDirectPutById(generator.generatorRegister(), generator.propertyNames().generatorThisPrivateName, generator.thisRegister(), PropertyNode::KnownDirect);
RegisterID* initialState = generator.emitLoad(nullptr, jsNumber(0));
generator.emitDirectPutById(generator.generatorRegister(), generator.propertyNames().generatorStatePrivateName, initialState, PropertyNode::KnownDirect);
generator.emitDirectPutById(generator.generatorRegister(), generator.propertyNames().generatorFramePrivateName, generator.emitLoad(nullptr, jsNull()), PropertyNode::KnownDirect);
generator.emitReturn(generator.generatorRegister());
break;
}
case SourceParseMode::GeneratorBodyMode:
{
RefPtr<Label> generatorBodyLabel = generator.newLabel();
{
// Dispatch on the resume mode: NormalMode jumps to the body, ThrowMode jumps to a throw of
// the generator value, and any other mode falls through to a return of the generator value.
RefPtr<RegisterID> condition = generator.newTemporary();
generator.emitEqualityOp(op_stricteq, condition.get(), generator.generatorResumeModeRegister(), generator.emitLoad(nullptr, jsNumber(static_cast<int32_t>(JSGeneratorFunction::GeneratorResumeMode::NormalMode))));
generator.emitJumpIfTrue(condition.get(), generatorBodyLabel.get());
RefPtr<Label> throwLabel = generator.newLabel();
generator.emitEqualityOp(op_stricteq, condition.get(), generator.generatorResumeModeRegister(), generator.emitLoad(nullptr, jsNumber(static_cast<int32_t>(JSGeneratorFunction::GeneratorResumeMode::ThrowMode))));
generator.emitJumpIfTrue(condition.get(), throwLabel.get());
generator.emitReturn(generator.generatorValueRegister());
generator.emitLabel(throwLabel.get());
generator.emitThrow(generator.generatorValueRegister());
}
// The body proper: emit the statements, then a trailing `return undefined` at the `done` label.
generator.emitLabel(generatorBodyLabel.get());
emitStatementsBytecode(generator, generator.ignoredResult());
RefPtr<Label> done = generator.newLabel();
generator.emitLabel(done.get());
generator.emitReturn(generator.emitLoad(nullptr, jsUndefined()));
generator.endGenerator(done.get());
break;
}
default:
{
// The default path also ends up in emitStatementsBytecode.
emitStatementsBytecode(generator, generator.ignoredResult());
StatementNode* singleStatement = this->singleStatement();
ReturnNode* returnNode = 0;
// Check for a return statement at the end of a function composed of a single block.
if (singleStatement && singleStatement->isBlock()) {
StatementNode* lastStatementInBlock = static_cast<BlockNode*>(singleStatement)->lastStatement();
if (lastStatementInBlock && lastStatementInBlock->isReturnNode())
returnNode = static_cast<ReturnNode*>(lastStatementInBlock);
}
// If there is no return we must automatically insert one.
if (!returnNode)
{
if (generator.constructorKind() == ConstructorKind::Derived && generator.needsToUpdateArrowFunctionContext() && generator.isSuperCallUsedInInnerArrowFunction())
generator.emitLoadThisFromArrowFunctionLexicalEnvironment(); // Arrow function can invoke 'super' in constructor and before leave constructor we need load 'this' from lexical arrow function environment
// The implicit return value is `this` for a constructor and undefined otherwise.
RegisterID* r0 = generator.isConstructor() ? generator.thisRegister() : generator.emitLoad(0, jsUndefined());
generator.emitProfileType(r0, ProfileTypeBytecodeFunctionReturnStatement); // Do not emit expression info for this profile because it's not in the user's source code.
generator.emitReturn(r0);
return;
}
break;
}
}
}
GeneratorWrapperFunctionMode
和GeneratorBodyMode
都包含了Generator
,推测应该和ES6推出的Generator 函数有关,这里简单提一下:
形式上,Generator 函数是一个普通函数,但是有两个特征。一是,function关键字与函数名之间有一个星号;二是,函数体内部使用yield表达式,定义不同的内部状态.
GeneratorBodyMode
样式如下:
// Example generator function: note the `*` after `function` and the `yield`
// expressions in the body, followed by a final `return`.
function* helloWorldGenerator() {
yield 'hello';
yield 'world';
return 'ending';
}
// Call the generator function and keep its result in `hw`.
var hw = helloWorldGenerator();
GeneratorWrapperFunctionMode
的样式如下:
function *gen(a, b = hello())
{
return
{
@generatorNext: function (@generator, @generatorState, @generatorValue, @generatorResumeMode)
{
arguments; // This `arguments` should reference to the gen's arguments.
...
}
}
}
简单认识一下,这里我们不会深究,因为我们最终关心的是字节码的生成。可以看到,不管是以上哪种Node的emitBytecode
,最后都不可避免地走到了emitStatementsBytecode
。毕竟不管是ProgramNode
、EvalNode
,还是FunctionNode
,都是由一些基本的语句组成的,因此Bytecode
的生成最终会和语法分析篇中提到的各种Node
扯上关系。同时我们也可以合理推测到:emitStatementsBytecode
只是一个起点,绝对不是终点,在里面可能会有各种递归循环调用在等待着我们。。。