Просмотр исходного кода

[MERGE #5153 @Penguinwizzard] Hoist speculation masking when possible

Merge pull request #5153 from Penguinwizzard:ponch_opt_clean

This change attempts to move speculation-related masking from the
inside of loops to the out-edges.

The bulk of the work happens in the dead store pass; I took advantage
of the multiple passes on loops to gather data about speculated uses,
and then the final dead store pass handles marking instructions which
we think are safe and storing the data for the Lowerer.

There's a new set of data structures introduced in ClusterList.h; the
purpose of these is to try to get relatively efficient set joins when
dealing with the sets of symbols here. There are a couple of parts that
may be a little tailored to this use case (SegmentClusterList is set
up for symbol clustering patterns and for two-pass runs), but I think
we might find another use for them sometime.

The Lowering code is pretty limited to handling the new opcode, which
has a list of symbols to mask, and expands to instructions that block
speculation on those symbols.
Derek Morris 7 лет назад
Родитель
Commit
579267d697

+ 719 - 4
lib/Backend/BackwardPass.cpp

@@ -11,7 +11,9 @@ BackwardPass::BackwardPass(Func * func, GlobOpt * globOpt, Js::Phase tag)
     : func(func), globOpt(globOpt), tag(tag), currentPrePassLoop(nullptr), tempAlloc(nullptr),
     preOpBailOutInstrToProcess(nullptr),
     considerSymAsRealUseInNoImplicitCallUses(nullptr),
-    isCollectionPass(false), currentRegion(nullptr)
+    isCollectionPass(false), currentRegion(nullptr),
+    collectionPassSubPhase(CollectionPassSubPhase::None),
+    isLoopPrepass(false)
 {
     // Those are the only two phase dead store will be used currently
     Assert(tag == Js::BackwardPhase || tag == Js::DeadStorePhase);
@@ -47,7 +49,7 @@ BackwardPass::DoSetDead(IR::Opnd * opnd, bool isDead) const
 bool
 BackwardPass::DoByteCodeUpwardExposedUsed() const
 {
-    return 
+    return
         !this->func->GetJITFunctionBody()->IsAsmJsMode() &&
         (
             (this->tag == Js::DeadStorePhase && this->func->hasBailout) ||
@@ -555,6 +557,12 @@ BackwardPass::MergeSuccBlocksInfo(BasicBlock * block)
             {
                 Assert(blockSucc->GetDataUseCount() != 0);
                 deleteData = (blockSucc->DecrementDataUseCount() == 0);
+                if (blockSucc->GetFirstInstr()->m_next->m_opcode == Js::OpCode::SpeculatedLoadFence)
+                {
+                    // We hold on to data for these blocks until the arena gets cleared due to unusual data lifetimes.
+                    deleteData = false;
+                    blockSucc->IncrementDataUseCount();
+                }
             }
 
 #if DBG
@@ -633,6 +641,7 @@ BackwardPass::MergeSuccBlocksInfo(BasicBlock * block)
             Assert((blockSucc->tempObjectVerifyTracker != nullptr)
                 || (blockSucc->isLoopHeader && (this->IsPrePass() || blockSucc->loop->IsDescendentOrSelf(block->loop)))
                 || !this->DoMarkTempObjectVerify());
+
             if (blockSucc->upwardExposedUses != nullptr)
             {
                 upwardExposedUses->Or(blockSucc->upwardExposedUses);
@@ -1123,7 +1132,10 @@ BackwardPass::MergeSuccBlocksInfo(BasicBlock * block)
             // blocks with data intact. Delete the block data now.
             Assert(block->backwardPassCurrentLoop);
             Assert(block->backwardPassCurrentLoop->hasDeadStoreCollectionPass);
-            Assert(!block->backwardPassCurrentLoop->hasDeadStorePrepass);
+            // The two situations where we might be keeping data around are either before we do
+            // the prepass, or when we're storing the data because we have a speculation-cancel
+            // block, which has longer lifetimes for its data.
+            Assert(!block->backwardPassCurrentLoop->hasDeadStorePrepass || block->GetFirstInstr()->m_next->m_opcode == Js::OpCode::SpeculatedLoadFence);
 
             DeleteBlockData(block);
         }
@@ -1405,6 +1417,23 @@ BackwardPass::ProcessLoopCollectionPass(BasicBlock *const lastBlock)
     currentPrePassLoop = collectionPassLoop;
     Assert(IsPrePass());
 
+    // This is also the location where we do the additional step of tracking what opnds
+    // are used inside the loop in memory dereferences, and thus need masking for cache
+    // attacks (Spectre). This is a fairly conservative approach, where we just track a
+    // set of symbols which are determined by each other inside the loop. This lets the
+    // second pass later on determine if a particular operation generating a symbol can
+    // avoid the Spectre masking overhead, since a symbol not dereferenced in the loops
+    // can be masked on the out-edge of the loop, which should be significantly cheaper
+    // than masking it every iteration.
+    AssertMsg(collectionPassLoop->symClusterList == nullptr, "clusterList should not have been initialized yet!");
+    // This is needed to work around tokenization issues with preprocessor macros which
+    // present themselves when using multiple template parameters.
+#ifndef _M_ARM
+    typedef SegmentClusterList<SymID, JitArenaAllocator> symClusterListType;
+    collectionPassLoop->symClusterList = JitAnew(this->func->m_fg->alloc, symClusterListType, this->func->m_fg->alloc, 256);
+    collectionPassLoop->internallyDereferencedSyms = JitAnew(this->func->m_fg->alloc, BVSparse<JitArenaAllocator>, this->func->m_fg->alloc);
+#endif
+
     // First pass
     BasicBlock *firstInnerLoopHeader = nullptr;
     {
@@ -1415,6 +1444,10 @@ BackwardPass::ProcessLoopCollectionPass(BasicBlock *const lastBlock)
         }
 #endif
 
+        // We want to be able to disambiguate this in ProcessBlock
+        CollectionPassSubPhase prevCollectionPassSubPhase = this->collectionPassSubPhase;
+        this->collectionPassSubPhase = CollectionPassSubPhase::FirstPass;
+
         FOREACH_BLOCK_BACKWARD_IN_RANGE_DEAD_OR_ALIVE(block, lastBlock, nullptr)
         {
             ProcessBlock(block);
@@ -1431,6 +1464,8 @@ BackwardPass::ProcessLoopCollectionPass(BasicBlock *const lastBlock)
             }
         } NEXT_BLOCK_BACKWARD_IN_RANGE_DEAD_OR_ALIVE;
 
+        this->collectionPassSubPhase = prevCollectionPassSubPhase;
+
 #if DBG_DUMP
         if(IsTraceEnabled())
         {
@@ -1439,6 +1474,49 @@ BackwardPass::ProcessLoopCollectionPass(BasicBlock *const lastBlock)
 #endif
     }
 
+#ifndef _M_ARM
+    // Since we generated the base data structures for the spectre handling, we can now
+    // cross-reference them to get the full set of what may be dereferenced in the loop
+    // and what is safe in speculation.
+#if DBG_DUMP
+    if (PHASE_TRACE(Js::SpeculationPropagationAnalysisPhase, this->func))
+    {
+        Output::Print(_u("Analysis Results for loop %u:\n"), collectionPassLoop->GetLoopNumber());
+        Output::Print(_u("ClusterList pre-consolidation: "));
+        collectionPassLoop->symClusterList->Dump();
+    }
+#endif // DBG_DUMP
+    collectionPassLoop->symClusterList->Consolidate();
+#if DBG_DUMP
+    if (PHASE_TRACE(Js::SpeculationPropagationAnalysisPhase, this->func))
+    {
+        Output::Print(_u("ClusterList post-consolidation: "));
+        collectionPassLoop->symClusterList->Dump();
+        Output::Print(_u("Internally dereferenced syms pre-propagation: "));
+        collectionPassLoop->internallyDereferencedSyms->Dump();
+    }
+#endif // DBG_DUMP
+    collectionPassLoop->symClusterList->Map<BVSparse<JitArenaAllocator>*, true>([](SymID index, SymID containingSetRoot, BVSparse<JitArenaAllocator>* bv){
+        if (bv->Test(index))
+        {
+            bv->Set(containingSetRoot);
+        }
+    }, collectionPassLoop->internallyDereferencedSyms);
+    collectionPassLoop->symClusterList->Map<BVSparse<JitArenaAllocator>*, true>([](SymID index, SymID containingSetRoot, BVSparse<JitArenaAllocator>* bv){
+        if (bv->Test(containingSetRoot))
+        {
+            bv->Set(index);
+        }
+    }, collectionPassLoop->internallyDereferencedSyms);
+#if DBG_DUMP
+    if (PHASE_TRACE(Js::SpeculationPropagationAnalysisPhase, this->func))
+    {
+        Output::Print(_u("Internally dereferenced syms post-propagation: "));
+        collectionPassLoop->internallyDereferencedSyms->Dump();
+    }
+#endif // DBG_DUMP
+#endif // !defined(_M_ARM)
+
     // Second pass, only needs to run if there are any inner loops, to propagate collected information into those loops
     if(firstInnerLoopHeader)
     {
@@ -1449,6 +1527,10 @@ BackwardPass::ProcessLoopCollectionPass(BasicBlock *const lastBlock)
         }
 #endif
 
+        // We want to be able to disambiguate this in ProcessBlock
+        CollectionPassSubPhase prevCollectionPassSubPhase = this->collectionPassSubPhase;
+        this->collectionPassSubPhase = CollectionPassSubPhase::SecondPass;
+
         FOREACH_BLOCK_BACKWARD_IN_RANGE_DEAD_OR_ALIVE(block, lastBlock, firstInnerLoopHeader)
         {
             Loop *const loop = block->loop;
@@ -1471,6 +1553,8 @@ BackwardPass::ProcessLoopCollectionPass(BasicBlock *const lastBlock)
             ProcessBlock(block);
         } NEXT_BLOCK_BACKWARD_IN_RANGE_DEAD_OR_ALIVE;
 
+        this->collectionPassSubPhase = prevCollectionPassSubPhase;
+
 #if DBG_DUMP
         if(IsTraceEnabled())
         {
@@ -1494,6 +1578,9 @@ BackwardPass::ProcessLoop(BasicBlock * lastBlock)
 
     Loop *loop = lastBlock->loop;
 
+    bool prevIsLoopPrepass = this->isLoopPrepass;
+    this->isLoopPrepass = true;
+
     // This code doesn't work quite as intended. It is meant to capture fields that are live out of a loop to limit the
     // number of implicit call bailouts the forward pass must create (only compiler throughput optimization, no impact
     // on emitted code), but because it looks only at the lexically last block in the loop, it does the right thing only
@@ -1560,6 +1647,8 @@ BackwardPass::ProcessLoop(BasicBlock * lastBlock)
     __analysis_assume(lastBlock);
     lastBlock->loop->hasDeadStorePrepass = true;
 
+    this->isLoopPrepass = prevIsLoopPrepass;
+
 #if DBG_DUMP
     if (this->IsTraceEnabled())
     {
@@ -2679,6 +2768,7 @@ BackwardPass::ProcessBlock(BasicBlock * block)
             }
         }
 
+
         if(!IsCollectionPass())
         {
             this->MarkTempProcessInstr(instr);
@@ -2711,11 +2801,540 @@ BackwardPass::ProcessBlock(BasicBlock * block)
 
         if(IsCollectionPass())
         {
+#ifndef _M_ARM
+            if (this->collectionPassSubPhase == CollectionPassSubPhase::FirstPass)
+            {
+                // In the collection pass we do multiple passes over loops. In these passes we keep
+                // track of sets of symbols, such that we can know whether or not they are used in
+                // ways that we need to protect them from side-channel attacks.
+                IR::Opnd const * src1 = instr->GetSrc1();
+                IR::Opnd const * src2 = instr->GetSrc2();
+                IR::Opnd const * dest = instr->GetDst();
+                // The marking is as follows, by default:
+                // 1. symbols on an instruction directly get marked as being part of the same set.
+                // 2. symbols used in indiropnds on an instruction get marked as being dereferenced.
+                // 3. symbols used as sources for some instructions get marked as being dereferenced.
+                // 4. non-type-specialized symbols tend to get marked as dereferenced.
+
+                // First, we need to find any symbol associated with this instruction as a targeted
+                // symid for the merge operations. This simplifies the later code.
+                auto getAnyDirectSymID = [](IR::Opnd const* opnd)
+                {
+                    SymID temp = SymID_Invalid;
+                    if (opnd == nullptr)
+                    {
+                        return temp;
+                    }
+
+                    switch (opnd->m_kind)
+                    {
+                    case IR::OpndKind::OpndKindInvalid:
+                        AssertOrFailFastMsg(false, "There should be no invalid operand kinds at this point...");
+                        break;
+                    case IR::OpndKind::OpndKindIntConst:
+                    case IR::OpndKind::OpndKindInt64Const:
+                    case IR::OpndKind::OpndKindFloatConst:
+                    case IR::OpndKind::OpndKindFloat32Const:
+                    case IR::OpndKind::OpndKindSimd128Const:
+                        // Nothing to do here, no symbols involved
+                        break;
+                    case IR::OpndKind::OpndKindHelperCall:
+                        // Nothing here either, I think?
+                        break;
+                    case IR::OpndKind::OpndKindSym:
+                        temp = opnd->AsSymOpnd()->m_sym->m_id;
+                        break;
+                    case IR::OpndKind::OpndKindReg:
+                        temp = opnd->AsRegOpnd()->m_sym->m_id;
+                        break;
+                    case IR::OpndKind::OpndKindAddr:
+                        // Should be constant, so nothing to do
+                        break;
+                    case IR::OpndKind::OpndKindIndir:
+                        // IndirOpnds don't themselves have symbols
+                        break;
+                    case IR::OpndKind::OpndKindLabel:
+                        // Should be constant, so not an issue
+                        break;
+                    case IR::OpndKind::OpndKindMemRef:
+                        // Should get a closer look, but looks ok?
+                        break;
+                    case IR::OpndKind::OpndKindRegBV:
+                        // Should be ok
+                        break;
+                    case IR::OpndKind::OpndKindList:
+                        // Since it's a list of RegOpnds, we just need to look at the first
+                        {
+                            IR::ListOpnd const* list = opnd->AsListOpnd();
+                            if (list->Count() > 0)
+                            {
+                                temp = list->Item(0)->m_sym->m_id;
+                            }
+                        }
+                        break;
+                    default:
+                        AssertOrFailFastMsg(false, "This should be unreachable - if we've added another OpndKind, add proper handling for it");
+                        break;
+                    }
+                    return temp;
+                };
+
+                SymID destSymID = getAnyDirectSymID(dest);
+
+                if (destSymID == SymID_Invalid)
+                {
+                    // It looks like we have no assignment to a symbol. As this pass is to mark the
+                    // symbols that are in the same set through assignment or computation, the lack
+                    // of a destination means that we don't have any set joins to do. We may need a
+                    // pass over the source operands to mark dereferences, but that's simpler.
+                }
+                else
+                {
+                    // We have a base, so now we want to go through and add any symbols to that set
+                    // if they're on the base level of operands on the function.
+                    auto addSymbolToSet = [](IR::Opnd const* opnd, Loop::LoopSymClusterList* scl, SymID targetSymID)
+                    {
+                        if (opnd == nullptr)
+                        {
+                            return;
+                        }
+                        switch (opnd->m_kind)
+                        {
+                        case IR::OpndKind::OpndKindInvalid:
+                            AssertOrFailFastMsg(false, "There should be no invalid operand kinds at this point...");
+                            break;
+                        case IR::OpndKind::OpndKindIntConst:
+                        case IR::OpndKind::OpndKindInt64Const:
+                        case IR::OpndKind::OpndKindFloatConst:
+                        case IR::OpndKind::OpndKindFloat32Const:
+                        case IR::OpndKind::OpndKindSimd128Const:
+                            // Nothing to do here, no symbols involved
+                            break;
+                        case IR::OpndKind::OpndKindHelperCall:
+                            // Nothing here either, I think?
+                            break;
+                        case IR::OpndKind::OpndKindSym:
+                            scl->Merge(targetSymID, opnd->AsSymOpnd()->m_sym->m_id);
+                            break;
+                        case IR::OpndKind::OpndKindReg:
+                            scl->Merge(targetSymID, opnd->AsRegOpnd()->m_sym->m_id);
+                            break;
+                        case IR::OpndKind::OpndKindAddr:
+                            // Should be constant, so nothing to do
+                            break;
+                        case IR::OpndKind::OpndKindIndir:
+                            // IndirOpnds don't themselves have symbols
+                            break;
+                        case IR::OpndKind::OpndKindLabel:
+                            // Should be constant, so not an issue
+                            break;
+                        case IR::OpndKind::OpndKindMemRef:
+                            // Should get a closer look, but looks ok?
+                            break;
+                        case IR::OpndKind::OpndKindRegBV:
+                            // Should be ok
+                            break;
+                        case IR::OpndKind::OpndKindList:
+                            // Needs iteration, but is straightforward beyond that
+                            {
+                                IR::ListOpnd const* list = opnd->AsListOpnd();
+                                for (int iter = 0; iter < list->Count(); iter++)
+                                {
+                                    scl->Merge(targetSymID, list->Item(iter)->m_sym->m_id);
+                                }
+                            }
+                            break;
+                        default:
+                            AssertOrFailFastMsg(false, "This should be unreachable - if we've added another OpndKind, add proper handling for it");
+                            break;
+                        }
+                    };
+                    addSymbolToSet(src1, this->currentPrePassLoop->symClusterList, destSymID);
+                    addSymbolToSet(src2, this->currentPrePassLoop->symClusterList, destSymID);
+                }
+
+                // Now we get to the second part - symbols used in indiropnds get marked as dereferenced
+                // This is just a matter of updating a bitvector, so it's fairly straightforward.
+                auto markDereferences = [](IR::Opnd const* opnd, BVSparse<JitArenaAllocator>* bv)
+                {
+                    if (opnd == nullptr)
+                    {
+                        return;
+                    }
+                    switch (opnd->m_kind)
+                    {
+                    case IR::OpndKind::OpndKindInvalid:
+                        AssertOrFailFastMsg(false, "There should be no invalid operand kinds at this point...");
+                        break;
+                    case IR::OpndKind::OpndKindIntConst:
+                    case IR::OpndKind::OpndKindInt64Const:
+                    case IR::OpndKind::OpndKindFloatConst:
+                    case IR::OpndKind::OpndKindFloat32Const:
+                    case IR::OpndKind::OpndKindSimd128Const:
+                        // Nothing to do here, no symbols involved
+                        break;
+                    case IR::OpndKind::OpndKindHelperCall:
+                        // Nothing here either, I think?
+                        break;
+                    case IR::OpndKind::OpndKindSym:
+                        // If it's not type-specialized, we may dereference it.
+                        if (!(opnd->GetValueType().IsNotObject()))
+                        {
+                            bv->Set(opnd->AsSymOpnd()->m_sym->m_id);
+                        }
+                        break;
+                    case IR::OpndKind::OpndKindReg:
+                        // If it's not type-specialized, we may dereference it.
+                        if (!(opnd->GetValueType().IsNotObject()) && !opnd->AsRegOpnd()->m_sym->IsTypeSpec())
+                        {
+                            bv->Set(opnd->AsRegOpnd()->m_sym->m_id);
+                        }
+                        break;
+                    case IR::OpndKind::OpndKindAddr:
+                        // Should be constant, so nothing to do
+                        break;
+                    case IR::OpndKind::OpndKindIndir:
+                        // Need to handle each component
+                        {
+                            IR::IndirOpnd const* indirOpnd = opnd->AsIndirOpnd();
+                            if (indirOpnd->GetBaseOpnd())
+                            {
+                                bv->Set(indirOpnd->GetBaseOpnd()->m_sym->m_id);
+                            }
+                            if (indirOpnd->GetIndexOpnd())
+                            {
+                                bv->Set(indirOpnd->GetIndexOpnd()->m_sym->m_id);
+                            }
+                        }
+                        break;
+                    case IR::OpndKind::OpndKindLabel:
+                        // Should be constant, so not an issue
+                        break;
+                    case IR::OpndKind::OpndKindMemRef:
+                        // Should get a closer look, but looks ok?
+                        break;
+                    case IR::OpndKind::OpndKindRegBV:
+                        // Should be ok
+                        break;
+                    case IR::OpndKind::OpndKindList:
+                        // Needs iteration, but is straightforward beyond that
+                        {
+                            IR::ListOpnd const* list = opnd->AsListOpnd();
+                            for (int iter = 0; iter < list->Count(); iter++)
+                            {
+                                // should be the same as OpndKindReg, since ListOpndType is RegOpnd
+                                // NOTE(review): the IsTypeSpec check on the next line calls opnd->AsRegOpnd(),
+                                // but opnd is the enclosing ListOpnd here — it should presumably be
+                                // list->Item(iter)->m_sym->IsTypeSpec(); confirm and fix before merging.
+                                if (!(list->Item(iter)->GetValueType().IsNotObject()) && !opnd->AsRegOpnd()->m_sym->IsTypeSpec())
+                                {
+                                    bv->Set(list->Item(iter)->m_sym->m_id);
+                                }
+                            }
+                        }
+                        break;
+                    default:
+                        AssertOrFailFastMsg(false, "This should be unreachable - if we've added another OpndKind, add proper handling for it");
+                        break;
+                    }
+                };
+                markDereferences(dest, this->currentPrePassLoop->internallyDereferencedSyms);
+                markDereferences(src1, this->currentPrePassLoop->internallyDereferencedSyms);
+                markDereferences(src2, this->currentPrePassLoop->internallyDereferencedSyms);
+
+                auto explicitlyMarkDereferenced = [](IR::Opnd const* opnd, BVSparse<JitArenaAllocator>* bv)
+                {
+                    if (opnd == nullptr)
+                    {
+                        return;
+                    }
+                    switch (opnd->m_kind)
+                    {
+                    case IR::OpndKind::OpndKindInvalid:
+                        AssertOrFailFastMsg(false, "There should be no invalid operand kinds at this point...");
+                        break;
+                    case IR::OpndKind::OpndKindIntConst:
+                    case IR::OpndKind::OpndKindInt64Const:
+                    case IR::OpndKind::OpndKindFloatConst:
+                    case IR::OpndKind::OpndKindFloat32Const:
+                    case IR::OpndKind::OpndKindSimd128Const:
+                        // Nothing to do here, no symbols involved
+                        break;
+                    case IR::OpndKind::OpndKindHelperCall:
+                        // Nothing here either, I think?
+                        break;
+                    case IR::OpndKind::OpndKindSym:
+                        // The instruction using this means that we may dereference the symbol,
+                        // regardless of type spec
+                        bv->Set(opnd->AsSymOpnd()->m_sym->m_id);
+                        break;
+                    case IR::OpndKind::OpndKindReg:
+                        // The instruction using this means that we may dereference the symbol,
+                        // regardless of type spec
+                        bv->Set(opnd->AsRegOpnd()->m_sym->m_id);
+                        break;
+                    case IR::OpndKind::OpndKindAddr:
+                        // Should be constant, so nothing to do
+                        break;
+                    case IR::OpndKind::OpndKindIndir:
+                        // Need to handle each component
+                    {
+                        IR::IndirOpnd const* indirOpnd = opnd->AsIndirOpnd();
+                        if (indirOpnd->GetBaseOpnd())
+                        {
+                            bv->Set(indirOpnd->GetBaseOpnd()->m_sym->m_id);
+                        }
+                        if (indirOpnd->GetIndexOpnd())
+                        {
+                            bv->Set(indirOpnd->GetIndexOpnd()->m_sym->m_id);
+                        }
+                    }
+                    break;
+                    case IR::OpndKind::OpndKindLabel:
+                        // Should be constant, so not an issue
+                        break;
+                    case IR::OpndKind::OpndKindMemRef:
+                        // Should get a closer look, but looks ok?
+                        break;
+                    case IR::OpndKind::OpndKindRegBV:
+                        // Should be ok
+                        break;
+                    case IR::OpndKind::OpndKindList:
+                        // Needs iteration, but is straightforward beyond that
+                    {
+                        IR::ListOpnd const* list = opnd->AsListOpnd();
+                        for (int iter = 0; iter < list->Count(); iter++)
+                        {
+                            // The instruction using this means that we may dereference the symbol,
+                            // regardless of type spec
+                            bv->Set(list->Item(iter)->m_sym->m_id);
+                        }
+                    }
+                    break;
+                    default:
+                        AssertOrFailFastMsg(false, "This should be unreachable - if we've added another OpndKind, add proper handling for it");
+                        break;
+                    }
+                };
+                // We may also have some specific instructions that dereference things - we can
+                // handle those specifically, since there's only a few of them
+                switch (instr->m_opcode)
+                {
+                case Js::OpCode::StArrInlineItem_CI4:
+                case Js::OpCode::StArrItemC_CI4:
+                case Js::OpCode::StArrItemI_CI4:
+                case Js::OpCode::StArrSegElemC:
+                case Js::OpCode::StArrSegItem_A:
+                case Js::OpCode::StArrSegItem_CI4:
+                case Js::OpCode::StArrViewElem:
+                case Js::OpCode::StAtomicWasm:
+                case Js::OpCode::StElemC:
+                case Js::OpCode::StElemI_A:
+                case Js::OpCode::StElemI_A_Strict:
+                case Js::OpCode::StEnvObjSlot:
+                case Js::OpCode::StEnvObjSlotChkUndecl:
+                case Js::OpCode::StFld:
+                case Js::OpCode::StFldStrict:
+                case Js::OpCode::StFuncExpr:
+                case Js::OpCode::StInnerObjSlot:
+                case Js::OpCode::StInnerObjSlotChkUndecl:
+                case Js::OpCode::StInnerSlot:
+                case Js::OpCode::StInnerSlotChkUndecl:
+                case Js::OpCode::StLocalFld:
+                case Js::OpCode::StLocalFuncExpr:
+                case Js::OpCode::StLocalObjSlot:
+                case Js::OpCode::StLocalObjSlotChkUndecl:
+                case Js::OpCode::StLocalSlot:
+                case Js::OpCode::StLocalSlotChkUndecl:
+                case Js::OpCode::StLoopBodyCount:
+                case Js::OpCode::StModuleSlot:
+                case Js::OpCode::StObjSlot:
+                case Js::OpCode::StObjSlotChkUndecl:
+                case Js::OpCode::StParamObjSlot:
+                case Js::OpCode::StParamObjSlotChkUndecl:
+                case Js::OpCode::StParamSlot:
+                case Js::OpCode::StParamSlotChkUndecl:
+                case Js::OpCode::StRootFld:
+                case Js::OpCode::StRootFldStrict:
+                case Js::OpCode::StSlot:
+                case Js::OpCode::StSlotBoxTemp:
+                case Js::OpCode::StSlotChkUndecl:
+                case Js::OpCode::StSuperFld:
+                case Js::OpCode::ProfiledStElemI_A:
+                case Js::OpCode::ProfiledStElemI_A_Strict:
+                case Js::OpCode::ProfiledStFld:
+                case Js::OpCode::ProfiledStFldStrict:
+                case Js::OpCode::ProfiledStLocalFld:
+                case Js::OpCode::ProfiledStRootFld:
+                case Js::OpCode::ProfiledStRootFldStrict:
+                case Js::OpCode::ProfiledStSuperFld:
+                    // Unfortunately, being fed into a store means that we could have aliasing, and the
+                    // consequence is that it may be re-read and then dereferenced. Note that we can do
+                    // this case if we poison any array symbol that we store to on the way out, but the
+                    // aliasing problem remains.
+                case Js::OpCode::ArgOut_A:
+                case Js::OpCode::ArgOut_ANonVar:
+                case Js::OpCode::ArgOut_A_Dynamic:
+                case Js::OpCode::ArgOut_A_FixupForStackArgs:
+                case Js::OpCode::ArgOut_A_FromStackArgs:
+                case Js::OpCode::ProfiledArgOut_A:
+                    // Getting passed to another function is a boundary that we can't analyze over.
+                case Js::OpCode::Ret:
+                    // Return arcs are pretty short in speculation, so we have to assume that we may be
+                    // returning to a situation that will dereference the symbol. Note that we will not
+                    // hit this path in normal jitted code, but it's more common in jitloopbody'd code.
+                    explicitlyMarkDereferenced(instr->GetSrc1(), this->currentPrePassLoop->internallyDereferencedSyms);
+                    break;
+                default:
+                    // most instructions don't have this sort of behavior
+                    break;
+                }
+            }
+#endif
+            // Continue normal CollectionPass behavior
             continue;
         }
 
         if (this->tag == Js::DeadStorePhase)
         {
+#ifndef _M_ARM
+            if(block->loop && !this->isLoopPrepass)
+            {
+                // In the second pass, we mark instructions that we go by as being safe or unsafe.
+                //
+                // This is all based on the information which we gathered in the previous pass. The
+                // symbol sets are cross-referenced and the bit-vector information is set such that
+                // the bit vector now holds a complete list of which symbols are dereferenced, both
+                // directly or indirectly, in the loop, so we can see if a particular instr creates
+                // such a symbol. If it doesn't, then we will not mask its destination, as it's not
+                // necessary to create a safe program.
+                //
+                // Note that if we avoid doing the masking here, we need to instead do it on the
+                // out-edges of the loop - otherwise an unsafe use of the symbol could happen after
+                // the loop and not get caught.
+
+                // This helper goes through and marks loop out-edges for a particular symbol set.
+                static void (*addOutEdgeMasking)(SymID, Loop*, JitArenaAllocator*) = [](SymID symID, Loop* loop, JitArenaAllocator *alloc) -> void
+                {
+                    // There are rare cases where we have no out-edges (the only way to leave this loop
+                    // is via a return inside the jitloopbody); in this case, we don't need to mask any
+                    // symbols on the out-edges, as we only need to worry about the store cases.
+                    if(loop->outwardSpeculationMaskInstrs == nullptr)
+                    {
+                        return;
+                    }
+                    BVSparse<JitArenaAllocator> *syms = JitAnew(alloc, BVSparse<JitArenaAllocator>, alloc);
+                    // We only need to do this for stack syms, and only for ones that are upwardexposed
+                    // in the block sourcing to the masking block, but it needs to be for all symbols a
+                    // mask-skipped load may be written to.
+                    loop->symClusterList->MapSet<BVSparse<JitArenaAllocator>*>(symID, [](SymID a, BVSparse<JitArenaAllocator> *symbols) {
+                        symbols->Set(a);
+                    }, syms);
+                    FOREACH_BITSET_IN_SPARSEBV(curSymID, syms)
+                    {
+                        if (!loop->GetFunc()->m_symTable->Find(curSymID)->IsStackSym())
+                        {
+                            syms->Clear(curSymID);
+                        }
+                    } NEXT_BITSET_IN_SPARSEBV;
+                    if (syms->IsEmpty())
+                    {
+                        // If there are no stack symids left, we have nothing to mask
+                        return;
+                    }
+                    // Now that we have a bitvector of things to try to mask on the out-edges, we'll go
+                    // over the list of outmask instructions.
+                    FOREACH_SLIST_ENTRY(IR::ByteCodeUsesInstr*, bcuInstr, loop->outwardSpeculationMaskInstrs)
+                    {
+                        // Get the upwardExposed information for the previous block
+                        IR::LabelInstr *blockLabel = bcuInstr->m_prev->AsLabelInstr();
+                        BasicBlock* maskingBlock = blockLabel->GetBasicBlock();
+                        // Since it's possible we have a multi-level loop structure (each with its own mask
+                        // instructions and dereferenced symbol list), we may be able to avoid masking some
+                        // symbols in interior loop->exterior loop edges if they're not dereferenced in the
+                        // exterior loop. This does mean, however, that we need to mask them further out.
+                        Loop* maskingBlockLoop = maskingBlock->loop;
+                        if (maskingBlockLoop != nullptr && !maskingBlockLoop->internallyDereferencedSyms->Test(symID))
+                        {
+                            addOutEdgeMasking(symID, maskingBlockLoop, alloc);
+                            continue;
+                        }
+                        // Instead of looking at the previous block (inside the loop), which may be cleaned
+                        // up or may yet be processed for dead stores, we instead can look at the mask/cmov
+                        // block, which we can keep from being cleaned up, and which will always be handled
+                        // before the loop is looked at (in this phase), since it is placed after the loop.
+                        AssertOrFailFast(maskingBlock->upwardExposedUses);
+                        AssertOrFailFast(maskingBlock->upwardExposedFields);
+                        BVSparse<JitArenaAllocator> *symsToMask = JitAnew(alloc, BVSparse<JitArenaAllocator>, alloc);
+                        symsToMask->Or(maskingBlock->upwardExposedUses);
+                        symsToMask->Or(maskingBlock->upwardExposedFields);
+                        symsToMask->And(syms);
+                        // If nothing is exposed, we have nothing to mask, and nothing to do here.
+                        if (!symsToMask->IsEmpty())
+                        {
+                            if (bcuInstr->GetByteCodeUpwardExposedUsed() == nullptr)
+                            {
+                                // This will initialize the internal structure properly
+                                bcuInstr->SetBV(JitAnew(bcuInstr->m_func->m_alloc, BVSparse<JitArenaAllocator>, bcuInstr->m_func->m_alloc));
+                            }
+#if DBG_DUMP
+                            if (PHASE_TRACE(Js::SpeculationPropagationAnalysisPhase, loop->topFunc))
+                            {
+                                Output::Print(_u("Adding symbols to out-edge masking for loop %u outward block %u:\n"), loop->GetLoopNumber(), maskingBlock->GetBlockNum());
+                                symsToMask->Dump();
+                            }
+#endif
+                            // Add the syms to the mask set
+                            const_cast<BVSparse<JitArenaAllocator> *>(bcuInstr->GetByteCodeUpwardExposedUsed())->Or(symsToMask);
+                        }
+                    } NEXT_SLIST_ENTRY;
+                };
+                switch (instr->m_opcode)
+                {
+                case Js::OpCode::LdElemI_A:
+                case Js::OpCode::ProfiledLdElemI_A:
+                {
+                    IR::Opnd* dest = instr->GetDst();
+                    if (dest->IsRegOpnd())
+                    {
+                        SymID symid = dest->AsRegOpnd()->m_sym->m_id;
+                        if (!block->loop->internallyDereferencedSyms->Test(symid))
+                        {
+                            instr->SetIsSafeToSpeculate(true);
+                            addOutEdgeMasking(symid, block->loop, this->tempAlloc);
+#if DBG_DUMP
+                            if (PHASE_TRACE(Js::SpeculationPropagationAnalysisPhase, this->func))
+                            {
+                                Output::Print(_u("Marking instruction as safe:\n"));
+                                instr->highlight = 0x0f;
+                                instr->Dump();
+                            }
+#endif
+                        }
+                    }
+                    else if (dest->IsSymOpnd())
+                    {
+                        SymID symid = dest->AsSymOpnd()->m_sym->m_id;
+                        if (!block->loop->internallyDereferencedSyms->Test(symid))
+                        {
+                            instr->SetIsSafeToSpeculate(true);
+                            addOutEdgeMasking(symid, block->loop, this->tempAlloc);
+#if DBG_DUMP
+                            if (PHASE_TRACE(Js::SpeculationPropagationAnalysisPhase, this->func))
+                            {
+                                Output::Print(_u("Marking instruction as safe:\n"));
+                                instr->highlight = 0x0f;
+                                instr->Dump();
+                            }
+#endif
+                        }
+                    }
+                }
+                break;
+                default:
+                    // Most instructions don't have any particular handling needed here, as they don't
+                    // get any masking regardless.
+                    break;
+                }
+            }
+#endif
             switch(instr->m_opcode)
             {
                 case Js::OpCode::LdSlot:
@@ -2943,6 +3562,102 @@ BackwardPass::ProcessBlock(BasicBlock * block)
     }
     NEXT_INSTR_BACKWARD_IN_BLOCK_EDITING;
 
+#ifndef _M_ARM
+    if (this->tag == Js::DeadStorePhase
+        // We don't need the masking blocks in asmjs/wasm mode
+        && !block->GetFirstInstr()->m_func->GetJITFunctionBody()->IsAsmJsMode()
+        && !block->GetFirstInstr()->m_func->GetJITFunctionBody()->IsWasmFunction()
+        && !block->isDead
+        && !block->isDeleted)
+    {
+        FOREACH_PREDECESSOR_BLOCK(blockPred, block)
+        {
+            // Now we need to handle loop out-edges. These need blocks inserted to prevent load
+            // of those symbols in speculation; the easiest way to do this is to CMOV them with
+            // a flag that we always know will be false, as this introduces a dependency on the
+            // register that can't be speculated (currently).
+            //
+            // Note that we're doing this backwards - looking from the target into the loop. We
+            // do it this way because we're going backwards over the blocks anyway; a
+            // block inserted after the branch may be impossible to correctly handle.
+            if (!blockPred->isDead && !blockPred->isDeleted && blockPred->loop != nullptr)
+            {
+                Loop* targetLoop = block->loop;
+                Loop* startingLoop = blockPred->loop;
+                bool addMaskingBlock = false;
+                if (targetLoop == nullptr)
+                {
+                    // If we're leaving to a non-looping context, we definitely want the masking block
+                    addMaskingBlock = true;
+                }
+                else if (targetLoop == startingLoop)
+                {
+                    // If we're still inside the same loop, we don't want a masking block
+                    addMaskingBlock = false;
+                }
+                else
+                {
+                    // We want a masking block if we're going to a loop enclosing the current one.
+                    Loop* loopTest = targetLoop;
+                    addMaskingBlock = true;
+                    while (loopTest != nullptr)
+                    {
+                        if (loopTest == startingLoop)
+                        {
+                            // the target loop is a child of the starting loop, so don't mask on the way
+                            addMaskingBlock = false;
+                            break;
+                        }
+                        loopTest = loopTest->parent;
+                    }
+                }
+                if (addMaskingBlock)
+                {
+                    // Avoid masking on the way from a masking block - we're already masking this jmp
+                    if (block->GetFirstInstr()->m_next->m_opcode == Js::OpCode::SpeculatedLoadFence)
+                    {
+                        addMaskingBlock = false;
+                    }
+                }
+                if (addMaskingBlock)
+                {
+                    // It's architecture dependent, so we just mark the block here and leave the actual
+                    // generation of the masking to the Lowerer.
+                    // Generated code here:
+                    // newTarget:
+                    // syms = targetedloadfence syms
+                    // jmp oldTarget
+
+                    // We need to increment the data use count since we're changing a successor.
+                    blockPred->IncrementDataUseCount();
+                    BasicBlock *newBlock = this->func->m_fg->InsertAirlockBlock(this->func->m_fg->FindEdge(blockPred, block));
+                    LABELNAMESET(newBlock->GetFirstInstr()->AsLabelInstr(), "Loop out-edge masking block");
+                    // This is a little bit of a misuse of ByteCodeUsesInstr - we're using it as just
+                    // a bitvector that we can add things to.
+                    IR::ByteCodeUsesInstr* masker = IR::ByteCodeUsesInstr::New(newBlock->GetFirstInstr());
+                    masker->m_opcode = Js::OpCode::SpeculatedLoadFence;
+                    // Add the one instruction we need to this block
+                    newBlock->GetFirstInstr()->InsertAfter(masker);
+                    // We need to initialize the data for this block, so that later stages of deadstore work properly.
+                    // Setting use count to 0 makes mergesucc create the structures
+                    newBlock->SetDataUseCount(0);
+                    // If we inserted an airlock block compensation block, we need to set the use count on that too.
+                    if (newBlock->prev && newBlock->prev->isAirLockCompensationBlock)
+                    {
+                        newBlock->prev->SetDataUseCount(0);
+                    }
+                    if (startingLoop->outwardSpeculationMaskInstrs == nullptr)
+                    {
+                        startingLoop->outwardSpeculationMaskInstrs = JitAnew(this->func->m_fg->alloc, SList<IR::ByteCodeUsesInstr*>, this->func->m_fg->alloc);
+                    }
+                    // We fill in the instruction later, so we need to add it to the loop's list of such instructions.
+                    startingLoop->outwardSpeculationMaskInstrs->Prepend(masker);
+                }
+            }
+        } NEXT_PREDECESSOR_BLOCK;
+    }
+#endif
+
     EndIntOverflowDoesNotMatterRange();
 
     if (!this->IsPrePass() && !block->isDead && block->isLoopHeader)
@@ -4866,7 +5581,7 @@ BackwardPass::InsertTypeTransition(IR::Instr *instrInsertBefore, StackSym *objSy
         int newCount;
         Js::PropertyIndex inlineSlotCapacity;
         Js::PropertyIndex newInlineSlotCapacity;
-        bool needSlotAdjustment = 
+        bool needSlotAdjustment =
             JITTypeHandler::NeedSlotAdjustment(initialType->GetTypeHandler(), finalType->GetTypeHandler(), &oldCount, &newCount, &inlineSlotCapacity, &newInlineSlotCapacity);
         if (needSlotAdjustment)
         {

+ 8 - 1
lib/Backend/BackwardPass.h

@@ -178,9 +178,16 @@ private:
     BVSparse<JitArenaAllocator> * intOverflowDoesNotMatterInRangeBySymId;
     BVSparse<JitArenaAllocator> * candidateSymsRequiredToBeInt;
     BVSparse<JitArenaAllocator> * candidateSymsRequiredToBeLossyInt;
-    StackSym *considerSymAsRealUseInNoImplicitCallUses;
+    StackSym * considerSymAsRealUseInNoImplicitCallUses;
     bool intOverflowCurrentlyMattersInRange;
     bool isCollectionPass;
+    enum class CollectionPassSubPhase
+    {
+        None,
+        FirstPass,
+        SecondPass
+    } collectionPassSubPhase;
+    bool isLoopPrepass;
 
     class FloatSymEquivalenceClass
     {

+ 37 - 6
lib/Backend/FlowGraph.cpp

@@ -561,11 +561,11 @@ FlowGraph::Build(void)
 
                             // Add edge to finally block, leave block
                             this->AddEdge(currentBlock, this->finallyLabelStack->Top()->GetBasicBlock());
-                            this->AddEdge(currentBlock, leaveBlock);                            
+                            this->AddEdge(currentBlock, leaveBlock);
                         }
                     }
                 }
-            }            
+            }
             else if (instr->m_opcode == Js::OpCode::Finally)
             {
                 AssertOrFailFast(!this->finallyLabelStack->Empty());
@@ -581,7 +581,7 @@ FlowGraph::Build(void)
             block->SetBlockNum(blockNum++);
         } NEXT_BLOCK_ALL;
     }
-    
+
     this->FindLoops();
 
 #if DBG_DUMP
@@ -1739,6 +1739,14 @@ FlowGraph::Destroy(void)
             Region * predRegion = nullptr;
             FOREACH_PREDECESSOR_BLOCK(predBlock, block)
             {
+                BasicBlock* intermediateBlock = block;
+                // Skip blocks inserted for airlock/masking purposes
+                while ((predBlock->isAirLockBlock || predBlock->isAirLockCompensationBlock) && predBlock->GetFirstInstr()->AsLabelInstr()->GetRegion() == region)
+                {
+                    Assert(predBlock->GetPredList()->HasOne());
+                    intermediateBlock = predBlock;
+                    predBlock = predBlock->GetPredList()->Head()->GetPred();
+                }
                 predRegion = predBlock->GetFirstInstr()->AsLabelInstr()->GetRegion();
                 if (predBlock->GetLastInstr() == nullptr)
                 {
@@ -1751,7 +1759,7 @@ FlowGraph::Destroy(void)
                     case Js::OpCode::TryCatch:
                     case Js::OpCode::TryFinally:
                         AssertMsg(region->GetParent() == predRegion, "Bad region prop on entry to try-catch/finally");
-                        if (block->GetFirstInstr() == predBlock->GetLastInstr()->AsBranchInstr()->GetTarget())
+                        if (intermediateBlock->GetFirstInstr() == predBlock->GetLastInstr()->AsBranchInstr()->GetTarget())
                         {
                             if (predBlock->GetLastInstr()->m_opcode == Js::OpCode::TryCatch)
                             {
@@ -2069,7 +2077,7 @@ FlowGraph::UpdateRegionForBlockFromEHPred(BasicBlock * block, bool reassign)
     Assert(region || block->GetPredList()->Count() == 0 || block->firstInstr->AsLabelInstr()->GetRegion());
 
     if (region)
-    { 
+    {
         if (!region->ehBailoutData)
         {
             region->AllocateEHBailoutData(this->func, tryInstr);
@@ -2253,15 +2261,36 @@ FlowGraph::InsertAirlockBlock(FlowEdge * edge)
     BasicBlock * sourceBlock = edge->GetPred();
     BasicBlock * sinkBlock = edge->GetSucc();
 
+    IR::Instr * sourceLastInstr = sourceBlock->GetLastInstr();
+
+    //
+    // Normalize block
+    //
+    if(!sourceLastInstr->IsBranchInstr())
+    {
+        // There are some cases where the last instruction of a block can be not a branch;
+        // for example, if it was previously a conditional branch that was impossible to take.
+        // In these situations, we can insert an unconditional branch to fallthrough for that
+        // block, to renormalize it.
+        SListBaseCounted<FlowEdge*>* successors = sourceBlock->GetSuccList();
+        // Only handling the case for one arc left at the moment; other cases are likely bugs.
+        AssertOrFailFastMsg(successors->HasOne(), "Failed to normalize weird block before airlock");
+        FlowEdge* onlyLink = successors->Head();
+        AssertOrFailFastMsg(onlyLink == edge, "Found duplicate of edge?");
+        AssertOrFailFastMsg(onlyLink->GetSucc() == sinkBlock, "State inconsistent");
+        sourceLastInstr->InsertAfter(IR::BranchInstr::New(Js::OpCode::Br, onlyLink->GetSucc()->GetFirstInstr()->AsLabelInstr(), sourceLastInstr->m_func));
+        sourceLastInstr = sourceLastInstr->m_next;
+    }
+
     BasicBlock * sinkPrevBlock = sinkBlock->prev;
     IR::Instr *  sinkPrevBlockLastInstr = sinkPrevBlock->GetLastInstr();
-    IR::Instr * sourceLastInstr = sourceBlock->GetLastInstr();
 
     airlockBlock->loop = sinkBlock->loop;
     airlockBlock->SetBlockNum(this->blockCount++);
 #ifdef DBG
     airlockBlock->isAirLockBlock = true;
 #endif
+
     //
     // Fixup block linkage
     //
@@ -2312,6 +2341,7 @@ FlowGraph::InsertAirlockBlock(FlowEdge * edge)
     airlockBlock->SetLastInstr(airlockBr);
 
     airlockLabel->SetByteCodeOffset(sinkLabel);
+    airlockLabel->SetRegion(sinkLabel->GetRegion());
 
     // Fixup flow out of sourceBlock
     IR::BranchInstr *sourceBr = sourceLastInstr->AsBranchInstr();
@@ -2433,6 +2463,7 @@ FlowGraph::InsertCompensationCodeForBlockMove(FlowEdge * edge,  bool insertToLoo
     compBlock->SetLastInstr(compBr);
 
     compLabel->SetByteCodeOffset(sinkLabel);
+    compLabel->SetRegion(sinkLabel->GetRegion());
 
     // Fixup flow out of sourceBlock
     if (sourceLastInstr->IsBranchInstr())

+ 7 - 0
lib/Backend/FlowGraph.h

@@ -603,6 +603,10 @@ public:
     BasicBlock *dominatingLoopCountableBlock;
     LoopCount *loopCount;
     SymIdToStackSymMap *loopCountBasedBoundBaseSyms;
+    typedef SegmentClusterList<SymID, JitArenaAllocator> LoopSymClusterList;
+    LoopSymClusterList *symClusterList;
+    BVSparse<JitArenaAllocator> * internallyDereferencedSyms;
+    SList<IR::ByteCodeUsesInstr*> *outwardSpeculationMaskInstrs;
 
     bool                isDead : 1;
     bool                hasDeadStoreCollectionPass : 1;
@@ -729,6 +733,9 @@ public:
         dominatingLoopCountableBlock(nullptr),
         loopCount(nullptr),
         loopCountBasedBoundBaseSyms(nullptr),
+        symClusterList(nullptr),
+        internallyDereferencedSyms(nullptr),
+        outwardSpeculationMaskInstrs(nullptr),
         isDead(false),
         allFieldsKilled(false),
         isLeaf(true),

+ 1 - 1
lib/Backend/GlobOpt.h

@@ -932,7 +932,7 @@ private:
     bool                    CheckIfInstrInTypeCheckSeqEmitsTypeCheck(IR::Instr* instr, IR::PropertySymOpnd *opnd);
     template<bool makeChanges>
     bool                    ProcessPropOpInTypeCheckSeq(IR::Instr* instr, IR::PropertySymOpnd *opnd, BasicBlock* block, bool updateExistingValue, bool* emitsTypeCheckOut = nullptr, bool* changesTypeValueOut = nullptr, bool *isObjTypeChecked = nullptr);
-    void                    KillObjectHeaderInlinedTypeSyms(BasicBlock *block, bool isObjTypeSpecialized, SymID symId = (SymID)-1);
+    void                    KillObjectHeaderInlinedTypeSyms(BasicBlock *block, bool isObjTypeSpecialized, SymID symId = SymID_Invalid);
     void                    ValueNumberObjectType(IR::Opnd *dstOpnd, IR::Instr *instr);
     void                    SetSingleTypeOnObjectTypeValue(Value* value, const JITTypeHolder type);
     void                    SetTypeSetOnObjectTypeValue(Value* value, Js::EquivalentTypeSet* typeSet);

+ 16 - 7
lib/Backend/IR.cpp

@@ -4532,8 +4532,16 @@ Instr::Dump(IRDumpFlags flags)
             }
         }
 
-        Output::SkipToColumn(20);
-        Output::Print(_u("="));
+        if (this->isSafeToSpeculate)
+        {
+            Output::SkipToColumn(19);
+            Output::Print(_u("<=="));
+        }
+        else
+        {
+            Output::SkipToColumn(20);
+            Output::Print(_u("="));
+        }
     }
 
     PrintOpCodeName();
@@ -4610,21 +4618,22 @@ Instr::Dump(IRDumpFlags flags)
         }
     }
 
-    if (this->IsByteCodeUsesInstr())
+    if (this->IsByteCodeUsesInstr() || this->m_opcode == Js::OpCode::SpeculatedLoadFence)
     {
-        if (this->AsByteCodeUsesInstr()->GetByteCodeUpwardExposedUsed())
+        ByteCodeUsesInstr* tempbcu = static_cast<ByteCodeUsesInstr*>(this);
+        if (tempbcu->GetByteCodeUpwardExposedUsed())
         {
             bool first = true;
-            FOREACH_BITSET_IN_SPARSEBV(id, this->AsByteCodeUsesInstr()->GetByteCodeUpwardExposedUsed())
+            FOREACH_BITSET_IN_SPARSEBV(id, tempbcu->GetByteCodeUpwardExposedUsed())
             {
                 Output::Print(first? _u("s%d") : _u(", s%d"), id);
                 first = false;
             }
             NEXT_BITSET_IN_SPARSEBV;
         }
-        if (this->AsByteCodeUsesInstr()->propertySymUse)
+        if (tempbcu->propertySymUse)
         {
-            Output::Print(_u("  PropSym: %d"), this->AsByteCodeUsesInstr()->propertySymUse->m_id);
+            Output::Print(_u("  PropSym: %d"), tempbcu->propertySymUse->m_id);
         }
     }
 

+ 12 - 8
lib/Backend/IR.h

@@ -168,7 +168,8 @@ protected:
         isCtorCall(false),
         isCallInstrProtectedByNoProfileBailout(false),
         hasSideEffects(false),
-        isNonFastPathFrameDisplay(false)
+        isNonFastPathFrameDisplay(false),
+        isSafeToSpeculate(false)
 #if DBG
         , highlight(0)
 #endif
@@ -219,6 +220,10 @@ public:
 
     bool            IsCloned() const { return isCloned; }
     void            SetIsCloned(bool isCloned) { this->isCloned = isCloned; }
+
+    bool            IsSafeToSpeculate() const { return isSafeToSpeculate; }
+    void            SetIsSafeToSpeculate(bool isSafe) { this->isSafeToSpeculate = isSafe; }
+
     bool            HasBailOutInfo() const { return hasBailOutInfo; }
     bool            HasAuxBailOut() const { return hasAuxBailOut; }
     bool            HasTypeCheckBailOut() const;
@@ -503,6 +508,9 @@ public:
     // used only for SIMD Ld/St from typed arrays.
     // we keep these here to avoid increase in number of opcodes and to not use ExtendedArgs
     uint8           dataWidth;
+#if DBG
+    WORD            highlight;
+#endif
 
 
     bool            isFsBased : 1; // TEMP : just for BS testing
@@ -526,8 +534,9 @@ public:
     bool            hasSideEffects : 1; // The instruction cannot be dead stored
     bool            isNonFastPathFrameDisplay : 1;
 protected:
-    bool            isCloned:1;
-    bool            hasBailOutInfo:1;
+    bool            isCloned : 1;
+    bool            hasBailOutInfo : 1;
+    bool            isSafeToSpeculate : 1;
 
     // Used for aux bail out. We are using same bailOutInfo, just different boolean to hide regular bail out.
     // Refer to ConvertToBailOutInstr implementation for details.
@@ -538,11 +547,6 @@ protected:
     Opnd *          m_dst;
     Opnd *          m_src1;
     Opnd *          m_src2;
-#if DBG
-    WORD            highlight;
-#endif
-
-
 
     void Init(Js::OpCode opcode, IRKind kind, Func * func);
     IR::Instr *     CloneInstr() const;

+ 1 - 1
lib/Backend/IR.inl

@@ -204,7 +204,7 @@ Instr::AsProfiledLabelInstr()
 inline bool
 Instr::IsByteCodeUsesInstr() const
 {
-    return GetKind() == IR::InstrKindByteCodeUses;
+    return GetKind() == IR::InstrKindByteCodeUses && m_opcode == Js::OpCode::ByteCodeUses;
 }
 
 inline ByteCodeUsesInstr *

+ 1 - 1
lib/Backend/LinearScan.cpp

@@ -2114,7 +2114,7 @@ void LinearScan::RecordLoopUse(Lifetime *lifetime, RegNum reg)
     // We are trying to avoid the need for compensation at the bottom of the loop if
     // the reg ends up being spilled before it is actually used.
     Loop *curLoop = this->curLoop;
-    SymID symId = (SymID)-1;
+    SymID symId = SymID_Invalid;
 
     if (lifetime)
     {

+ 213 - 176
lib/Backend/Lower.cpp

@@ -563,7 +563,7 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
             {
                 m_lowererMD.GenerateFastScopedStFld(instr);
             }
-            Js::PropertyOperationFlags flags = static_cast<Js::PropertyOperationFlags>((instr->m_opcode == Js::OpCode::ConsoleScopedStFld ?  Js::PropertyOperation_None : Js::PropertyOperation_StrictMode) | Js::PropertyOperation_AllowUndeclInConsoleScope);
+            Js::PropertyOperationFlags flags = static_cast<Js::PropertyOperationFlags>((instr->m_opcode == Js::OpCode::ConsoleScopedStFld ? Js::PropertyOperation_None : Js::PropertyOperation_StrictMode) | Js::PropertyOperation_AllowUndeclInConsoleScope);
             instrPrev = this->LowerScopedStFld(instr, IR::HelperOp_ConsolePatchSetPropertyScoped, true, true, flags);
             break;
         }
@@ -1081,10 +1081,10 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
                 break;
             }
 #endif
-            if(instr->HasBailOutInfo())
+            if (instr->HasBailOutInfo())
             {
                 const auto bailOutKind = instr->GetBailOutKind();
-                if(bailOutKind & IR::BailOutOnResultConditions ||
+                if (bailOutKind & IR::BailOutOnResultConditions ||
                     bailOutKind == IR::BailOutOnFailedHoistedLoopCountBasedBoundCheck)
                 {
                     const auto nonBailOutInstr = SplitBailOnResultCondition(instr);
@@ -1092,7 +1092,7 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
                     LowerBailOnResultCondition(instr, &bailOutLabel, &skipBailOutLabel);
                     LowerInstrWithBailOnResultCondition(nonBailOutInstr, bailOutKind, bailOutLabel, skipBailOutLabel);
                 }
-                else if(bailOutKind == IR::BailOnModByPowerOf2)
+                else if (bailOutKind == IR::BailOnModByPowerOf2)
                 {
                     Assert(instr->m_opcode == Js::OpCode::Rem_I4);
                     bool fastPath = GenerateSimplifiedInt4Rem(instr);
@@ -1154,12 +1154,12 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
         case Js::OpCode::LdLen_A:
         {
             bool fastPath = !noMathFastPath;
-            if(!fastPath && instr->HasBailOutInfo())
+            if (!fastPath && instr->HasBailOutInfo())
             {
                 // Some bailouts are generated around the helper call, and will work even if the fast path is disabled. Other
                 // bailouts require the fast path.
                 const IR::BailOutKind bailOutKind = instr->GetBailOutKind();
-                if(bailOutKind & IR::BailOutKindBits)
+                if (bailOutKind & IR::BailOutKindBits)
                 {
                     fastPath = true;
                 }
@@ -1174,11 +1174,11 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
             }
 
             bool instrIsInHelperBlock = false;
-            if(!fastPath)
+            if (!fastPath)
             {
                 LowerLdLen(instr, false);
             }
-            else if(GenerateFastLdLen(instr, &instrIsInHelperBlock))
+            else if (GenerateFastLdLen(instr, &instrIsInHelperBlock))
             {
                 Assert(
                     !instr->HasBailOutInfo() ||
@@ -1236,8 +1236,8 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
             }
             else
             {
-                 this->GenerateLdThisStrict(instr);
-                 instr->Remove();
+                this->GenerateLdThisStrict(instr);
+                instr->Remove();
             }
             break;
 
@@ -1586,13 +1586,13 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
             // Note: under debugger (Fast F12) don't let GenerateFastStElemI which calls into ToNumber_Helper
             //       which takes double, and currently our helper wrapper doesn't support double.
             bool fastPath = !noMathFastPath && !m_func->IsJitInDebugMode();
-            if(!fastPath && instr->HasBailOutInfo())
+            if (!fastPath && instr->HasBailOutInfo())
             {
                 // Some bailouts are generated around the helper call, and will work even if the fast path is disabled. Other
                 // bailouts require the fast path.
                 const IR::BailOutKind bailOutKind = instr->GetBailOutKind();
                 const IR::BailOutKind bailOutKindBits = bailOutKind & IR::BailOutKindBits;
-                if(bailOutKindBits & ~(IR::BailOutOnMissingValue | IR::BailOutConvertedNativeArray))
+                if (bailOutKindBits & ~(IR::BailOutOnMissingValue | IR::BailOutConvertedNativeArray))
                 {
                     fastPath = true;
                 }
@@ -1625,7 +1625,7 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
             else if (GenerateFastStElemI(instr, &instrIsInHelperBlock))
             {
 #if DBG
-                if(instr->HasBailOutInfo())
+                if (instr->HasBailOutInfo())
                 {
                     const IR::BailOutKind bailOutKind = instr->GetBailOutKind();
 
@@ -1634,7 +1634,7 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
                         !(
                             bailOutKind &
                             (IR::BailOutConventionalNativeArrayAccessOnly | IR::BailOutOnArrayAccessHelperCall)
-                        ));
+                            ));
                 }
 #endif
                 this->LowerStElemI(
@@ -1653,13 +1653,13 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
                 (
                     instr->m_opcode != Js::OpCode::LdMethodElem ||
                     instr->GetSrc1()->AsIndirOpnd()->GetBaseOpnd()->GetValueType().IsLikelyObject()
-                );
-            if(!fastPath && instr->HasBailOutInfo())
+                    );
+            if (!fastPath && instr->HasBailOutInfo())
             {
                 // Some bailouts are generated around the helper call, and will work even if the fast path is disabled. Other
                 // bailouts require the fast path.
                 const IR::BailOutKind bailOutKind = instr->GetBailOutKind();
-                if(bailOutKind & IR::BailOutKindBits)
+                if (bailOutKind & IR::BailOutKindBits)
                 {
                     fastPath = true;
                 }
@@ -1693,7 +1693,7 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
             else if (GenerateFastLdElemI(instr, &instrIsInHelperBlock))
             {
 #if DBG
-                if(instr->HasBailOutInfo())
+                if (instr->HasBailOutInfo())
                 {
                     const IR::BailOutKind bailOutKind = instr->GetBailOutKind();
 
@@ -1702,7 +1702,7 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
                         !(
                             bailOutKind &
                             (IR::BailOutConventionalNativeArrayAccessOnly | IR::BailOutOnArrayAccessHelperCall)
-                        ));
+                            ));
                 }
 #endif
                 this->LowerLdElemI(
@@ -1789,11 +1789,11 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
                 break;
             }
             m_lowererMD.ChangeToAssign(instr);
-            if(instr->HasBailOutInfo())
+            if (instr->HasBailOutInfo())
             {
                 IR::BailOutKind bailOutKind = instr->GetBailOutKind();
 
-                if(bailOutKind == IR::BailOutExpectingString)
+                if (bailOutKind == IR::BailOutExpectingString)
                 {
                     this->LowerBailOnNotString(instr);
                 }
@@ -1818,7 +1818,7 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
             Assert(instr->GetSrc1()->GetType() == TyVar);
             if (instr->GetDst()->GetType() == TyInt32)
             {
-                if(m_lowererMD.EmitLoadInt32(instr, !(instr->HasBailOutInfo() && (instr->GetBailOutKind() == IR::BailOutOnNotPrimitive))))
+                if (m_lowererMD.EmitLoadInt32(instr, !(instr->HasBailOutInfo() && (instr->GetBailOutKind() == IR::BailOutOnNotPrimitive))))
                 {
                     // Bail out instead of calling a helper
                     Assert(instr->GetBailOutKind() == IR::BailOutIntOnly || instr->GetBailOutKind() == IR::BailOutExpectingInteger);
@@ -1852,7 +1852,7 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
             {
                 Assert(m_func->GetJITFunctionBody()->IsWasmFunction());
                 GenerateRuntimeError(instr, WASMERR_InvalidTypeConversion);
-                instr->ReplaceSrc1(IR::Simd128ConstOpnd::New({0,0,0,0}, instr->GetDst()->GetType(), m_func));
+                instr->ReplaceSrc1(IR::Simd128ConstOpnd::New({ 0,0,0,0 }, instr->GetDst()->GetType(), m_func));
                 LowererMD::ChangeToAssign(instr);
             }
 #endif
@@ -1869,18 +1869,18 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
 
         case Js::OpCode::ArgOut_A_Inline:
         case Js::OpCode::ArgOut_A_Dynamic:
-            {
-                // ArgOut/StartCall are normally lowered by the lowering of the associated call instr.
-                // If the call becomes unreachable, we could end up with an orphan ArgOut or StartCall.
-                // Change the ArgOut into a store to the stack for bailouts
-                instr->FreeSrc2();
-                StackSym *argSym = instr->GetDst()->AsSymOpnd()->m_sym->AsStackSym();
-                argSym->m_offset = this->m_func->StackAllocate(sizeof(Js::Var));
-                argSym->m_allocated = true;
-                argSym->m_isOrphanedArg = true;
-                this->m_lowererMD.ChangeToAssign(instr);
-            }
-            break;
+        {
+            // ArgOut/StartCall are normally lowered by the lowering of the associated call instr.
+            // If the call becomes unreachable, we could end up with an orphan ArgOut or StartCall.
+            // Change the ArgOut into a store to the stack for bailouts
+            instr->FreeSrc2();
+            StackSym *argSym = instr->GetDst()->AsSymOpnd()->m_sym->AsStackSym();
+            argSym->m_offset = this->m_func->StackAllocate(sizeof(Js::Var));
+            argSym->m_allocated = true;
+            argSym->m_isOrphanedArg = true;
+            this->m_lowererMD.ChangeToAssign(instr);
+        }
+        break;
         case Js::OpCode::LoweredStartCall:
         case Js::OpCode::StartCall:
             // ArgOut/StartCall are normally lowered by the lowering of the associated call instr.
@@ -2040,12 +2040,12 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
             break;
 
         case Js::OpCode::BrFncEqApply:
-          LowerBrFncApply(instr,IR::HelperOp_OP_BrFncEqApply);
-          break;
+            LowerBrFncApply(instr, IR::HelperOp_OP_BrFncEqApply);
+            break;
 
         case Js::OpCode::BrFncNeqApply:
-          LowerBrFncApply(instr,IR::HelperOp_OP_BrFncNeqApply);
-          break;
+            LowerBrFncApply(instr, IR::HelperOp_OP_BrFncNeqApply);
+            break;
 
         case Js::OpCode::BrHasSideEffects:
         case Js::OpCode::BrNotHasSideEffects:
@@ -2236,147 +2236,147 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
             break;
 
         case Js::OpCode::ProfiledLoopStart:
+        {
+            Assert(m_func->DoSimpleJitDynamicProfile());
+            Assert(instr->IsJitProfilingInstr());
+
+            // Check for the helper instr from IRBuilding (it won't be there if there are no LoopEnds due to an infinite loop)
+            auto prev = instr->m_prev;
+            if (prev->IsJitProfilingInstr() && prev->AsJitProfilingInstr()->isLoopHelper)
             {
-                Assert(m_func->DoSimpleJitDynamicProfile());
-                Assert(instr->IsJitProfilingInstr());
+                auto saveOpnd = prev->UnlinkDst();
+                instrPrev = prev->m_prev;
+                prev->Remove();
 
-                // Check for the helper instr from IRBuilding (it won't be there if there are no LoopEnds due to an infinite loop)
-                auto prev = instr->m_prev;
-                if (prev->IsJitProfilingInstr() && prev->AsJitProfilingInstr()->isLoopHelper)
+                const auto starFlag = GetImplicitCallFlagsOpnd();
+                IR::AutoReuseOpnd a(starFlag, m_func);
+                this->InsertMove(saveOpnd, starFlag, instr);
+                this->InsertMove(starFlag, CreateClearImplicitCallFlagsOpnd(), instr);
+            }
+            else
+            {
+#if DBG
+                // Double check that we indeed do not have a LoopEnd that is part of the same loop for the rest of the function
+                auto cur = instr;
+                auto loopNumber = instr->AsJitProfilingInstr()->loopNumber;
+                while (cur)
                 {
-                    auto saveOpnd = prev->UnlinkDst();
-                    instrPrev = prev->m_prev;
-                    prev->Remove();
-
-                    const auto starFlag = GetImplicitCallFlagsOpnd();
-                    IR::AutoReuseOpnd a(starFlag, m_func);
-                    this->InsertMove(saveOpnd, starFlag, instr);
-                    this->InsertMove(starFlag, CreateClearImplicitCallFlagsOpnd(), instr);
+                    Assert(cur->m_opcode != Js::OpCode::ProfiledLoopEnd || cur->IsJitProfilingInstr() && cur->AsJitProfilingInstr()->loopNumber != loopNumber);
+                    cur = cur->m_next;
                 }
-                else
-                {
-#if DBG
-                    // Double check that we indeed do not have a LoopEnd that is part of the same loop for the rest of the function
-                    auto cur = instr;
-                    auto loopNumber = instr->AsJitProfilingInstr()->loopNumber;
-                    while (cur)
-                    {
-                        Assert(cur->m_opcode != Js::OpCode::ProfiledLoopEnd || cur->IsJitProfilingInstr() && cur->AsJitProfilingInstr()->loopNumber != loopNumber);
-                        cur = cur->m_next;
-                    }
 #endif
-                }
+            }
 
-                // If we turned off fulljit, there's no reason to do this.
-                if (PHASE_OFF(Js::FullJitPhase, m_func))
-                {
-                    instr->Remove();
-                }
-                else
-                {
-                    Assert(instr->GetDst());
-                    instr->SetSrc1(IR::HelperCallOpnd::New(IR::HelperSimpleGetScheduledEntryPoint, m_func));
-                    m_lowererMD.LoadHelperArgument(instr, IR::Opnd::CreateUint32Opnd(instr->AsJitProfilingInstr()->loopNumber, m_func));
-                    m_lowererMD.LoadHelperArgument(instr, IR::Opnd::CreateFramePointerOpnd(m_func));
-                    this->m_lowererMD.LowerCall(instr, 0);
-                }
-                break;
+            // If we turned off fulljit, there's no reason to do this.
+            if (PHASE_OFF(Js::FullJitPhase, m_func))
+            {
+                instr->Remove();
             }
+            else
+            {
+                Assert(instr->GetDst());
+                instr->SetSrc1(IR::HelperCallOpnd::New(IR::HelperSimpleGetScheduledEntryPoint, m_func));
+                m_lowererMD.LoadHelperArgument(instr, IR::Opnd::CreateUint32Opnd(instr->AsJitProfilingInstr()->loopNumber, m_func));
+                m_lowererMD.LoadHelperArgument(instr, IR::Opnd::CreateFramePointerOpnd(m_func));
+                this->m_lowererMD.LowerCall(instr, 0);
+            }
+            break;
+        }
         case Js::OpCode::ProfiledLoopBodyStart:
+        {
+            Assert(m_func->DoSimpleJitDynamicProfile());
+
+            const auto loopNum = instr->AsJitProfilingInstr()->loopNumber;
+            Assert(loopNum < m_func->GetJITFunctionBody()->GetLoopCount());
+
+            auto entryPointOpnd = instr->UnlinkSrc1();
+            auto dobailout = instr->UnlinkDst();
+            const auto dobailoutType = TyUint8;
+            Assert(dobailout->GetType() == TyUint8 && sizeof(decltype(Js::SimpleJitHelpers::IsLoopCodeGenDone(nullptr))) == 1);
+
+            m_lowererMD.LoadHelperArgument(instr, IR::IntConstOpnd::New(0, TyUint32, m_func)); // zero indicates that we do not want to add flags back in
+            m_lowererMD.LoadHelperArgument(instr, IR::IntConstOpnd::New(loopNum, TyUint32, m_func));
+            m_lowererMD.LoadHelperArgument(instr, IR::Opnd::CreateFramePointerOpnd(m_func));
+            instr->SetSrc1(IR::HelperCallOpnd::New(IR::HelperSimpleRecordLoopImplicitCallFlags, m_func));
+            m_lowererMD.LowerCall(instr, 0);
+
+            // Outline of JITed code:
+            //
+            // LoopStart:
+            //   entryPoint = GetScheduledEntryPoint(framePtr, loopNum)
+            // LoopBodyStart:
+            //   uint8 dobailout;
+            //   if (entryPoint) {
+            //     dobailout = IsLoopCodeGenDone(entryPoint)
+            //   } else {
+            //     dobailout = ++interpretCount >= threshold
+            //   }
+            //   // already exists from IRBuilding:
+            //   if (dobailout) {
+            //       Bailout
+            //   }
+
+            if (PHASE_OFF(Js::FullJitPhase, m_func) || !m_func->GetJITFunctionBody()->DoJITLoopBody())
+            {
+                // If we're not doing fulljit, we've turned off JitLoopBodies, or if we don't have loop headers allocated (the function has a Try,  etc)
+                //      just move false to dobailout
+                this->InsertMove(dobailout, IR::IntConstOpnd::New(0, dobailoutType, m_func, true), instr->m_next);
+            }
+            else if (m_func->GetWorkItem()->GetJITTimeInfo()->ForceJITLoopBody())
+            {
+                // If we're forcing jit loop bodies, move true to dobailout
+                this->InsertMove(dobailout, IR::IntConstOpnd::New(1, dobailoutType, m_func, true), instr->m_next);
+            }
+            else
             {
-                Assert(m_func->DoSimpleJitDynamicProfile());
+                // Put in the labels
+                auto entryPointIsNull = IR::LabelInstr::New(Js::OpCode::Label, m_func);
+                auto checkDoBailout = IR::LabelInstr::New(Js::OpCode::Label, m_func);
+                instr->InsertAfter(checkDoBailout);
+                instr->InsertAfter(entryPointIsNull);
 
-                const auto loopNum = instr->AsJitProfilingInstr()->loopNumber;
-                Assert(loopNum < m_func->GetJITFunctionBody()->GetLoopCount());
+                this->InsertCompareBranch(entryPointOpnd, IR::AddrOpnd::New(nullptr, IR::AddrOpndKindDynamicMisc, m_func), Js::OpCode::BrEq_A, false, entryPointIsNull, instr->m_next);
 
-                auto entryPointOpnd = instr->UnlinkSrc1();
-                auto dobailout = instr->UnlinkDst();
-                const auto dobailoutType = TyUint8;
-                Assert(dobailout->GetType() == TyUint8 && sizeof(decltype(Js::SimpleJitHelpers::IsLoopCodeGenDone(nullptr))) == 1);
+                // If the entry point is not null
+                auto isCodeGenDone = IR::Instr::New(Js::OpCode::Call, dobailout, IR::HelperCallOpnd::New(IR::HelperSimpleIsLoopCodeGenDone, m_func), m_func);
+                entryPointIsNull->InsertBefore(isCodeGenDone);
+                m_lowererMD.LoadHelperArgument(isCodeGenDone, entryPointOpnd);
+                m_lowererMD.LowerCall(isCodeGenDone, 0);
+                this->InsertBranch(LowererMD::MDUncondBranchOpcode, true, checkDoBailout, entryPointIsNull);
 
-                m_lowererMD.LoadHelperArgument(instr, IR::IntConstOpnd::New(0, TyUint32, m_func)); // zero indicates that we do not want to add flags back in
-                m_lowererMD.LoadHelperArgument(instr, IR::IntConstOpnd::New(loopNum, TyUint32, m_func));
-                m_lowererMD.LoadHelperArgument(instr, IR::Opnd::CreateFramePointerOpnd(m_func));
-                instr->SetSrc1(IR::HelperCallOpnd::New(IR::HelperSimpleRecordLoopImplicitCallFlags, m_func));
-                m_lowererMD.LowerCall(instr, 0);
+                const auto type = TyUint32;
+                auto countReg = IR::RegOpnd::New(type, m_func);
+                auto countAddr = IR::MemRefOpnd::New(m_func->GetJITFunctionBody()->GetLoopHeaderAddr(loopNum) + Js::LoopHeader::GetOffsetOfInterpretCount(), type, m_func);
+                IR::AutoReuseOpnd a(countReg, m_func), b(countAddr, m_func);
+                this->InsertAdd(false, countReg, countAddr, IR::IntConstOpnd::New(1, type, m_func, true), checkDoBailout);
+                this->InsertMove(countAddr, countReg, checkDoBailout);
 
-                // Outline of JITed code:
-                //
-                // LoopStart:
-                //   entryPoint = GetScheduledEntryPoint(framePtr, loopNum)
-                // LoopBodyStart:
-                //   uint8 dobailout;
-                //   if (entryPoint) {
-                //     dobailout = IsLoopCodeGenDone(entryPoint)
-                //   } else {
-                //     dobailout = ++interpretCount >= threshold
-                //   }
-                //   // already exists from IRBuilding:
-                //   if (dobailout) {
-                //       Bailout
-                //   }
-
-                if (PHASE_OFF(Js::FullJitPhase, m_func) || !m_func->GetJITFunctionBody()->DoJITLoopBody())
-                {
-                    // If we're not doing fulljit, we've turned off JitLoopBodies, or if we don't have loop headers allocated (the function has a Try,  etc)
-                    //      just move false to dobailout
-                    this->InsertMove(dobailout, IR::IntConstOpnd::New(0, dobailoutType, m_func, true), instr->m_next);
-                }
-                else if (m_func->GetWorkItem()->GetJITTimeInfo()->ForceJITLoopBody())
-                {
-                    // If we're forcing jit loop bodies, move true to dobailout
-                    this->InsertMove(dobailout, IR::IntConstOpnd::New(1, dobailoutType, m_func, true), instr->m_next);
-                }
-                else
-                {
-                    // Put in the labels
-                    auto entryPointIsNull = IR::LabelInstr::New(Js::OpCode::Label, m_func);
-                    auto checkDoBailout = IR::LabelInstr::New(Js::OpCode::Label, m_func);
-                    instr->InsertAfter(checkDoBailout);
-                    instr->InsertAfter(entryPointIsNull);
-
-                    this->InsertCompareBranch(entryPointOpnd, IR::AddrOpnd::New(nullptr, IR::AddrOpndKindDynamicMisc, m_func), Js::OpCode::BrEq_A, false, entryPointIsNull, instr->m_next);
-
-                    // If the entry point is not null
-                    auto isCodeGenDone = IR::Instr::New(Js::OpCode::Call, dobailout, IR::HelperCallOpnd::New(IR::HelperSimpleIsLoopCodeGenDone, m_func), m_func);
-                    entryPointIsNull->InsertBefore(isCodeGenDone);
-                    m_lowererMD.LoadHelperArgument(isCodeGenDone, entryPointOpnd);
-                    m_lowererMD.LowerCall(isCodeGenDone, 0);
-                    this->InsertBranch(LowererMD::MDUncondBranchOpcode, true, checkDoBailout, entryPointIsNull);
-
-                    const auto type = TyUint32;
-                    auto countReg = IR::RegOpnd::New(type, m_func);
-                    auto countAddr = IR::MemRefOpnd::New(m_func->GetJITFunctionBody()->GetLoopHeaderAddr(loopNum) + Js::LoopHeader::GetOffsetOfInterpretCount(), type, m_func);
-                    IR::AutoReuseOpnd a(countReg, m_func), b(countAddr, m_func);
-                    this->InsertAdd(false, countReg, countAddr, IR::IntConstOpnd::New(1, type, m_func, true), checkDoBailout);
-                    this->InsertMove(countAddr, countReg, checkDoBailout);
-
-                    this->InsertMove(dobailout, IR::IntConstOpnd::New(0, dobailoutType, m_func, true), checkDoBailout);
-
-                    this->InsertCompareBranch(countReg, IR::IntConstOpnd::New(m_func->GetJITFunctionBody()->GetLoopHeaderData(loopNum)->interpretCount, type, m_func), Js::OpCode::BrLt_A, checkDoBailout, checkDoBailout);
-                    this->InsertMove(dobailout, IR::IntConstOpnd::New(1, dobailoutType, m_func, true), checkDoBailout);
-                    // fallthrough
-
-                    // Label checkDoBailout (inserted above)
-                }
+                this->InsertMove(dobailout, IR::IntConstOpnd::New(0, dobailoutType, m_func, true), checkDoBailout);
+
+                this->InsertCompareBranch(countReg, IR::IntConstOpnd::New(m_func->GetJITFunctionBody()->GetLoopHeaderData(loopNum)->interpretCount, type, m_func), Js::OpCode::BrLt_A, checkDoBailout, checkDoBailout);
+                this->InsertMove(dobailout, IR::IntConstOpnd::New(1, dobailoutType, m_func, true), checkDoBailout);
+                // fallthrough
+
+                // Label checkDoBailout (inserted above)
             }
-            break;
+        }
+        break;
 
         case Js::OpCode::ProfiledLoopEnd:
-            {
-                Assert(m_func->DoSimpleJitDynamicProfile());
+        {
+            Assert(m_func->DoSimpleJitDynamicProfile());
 
-                // This is set up in IRBuilding
-                Assert(instr->GetSrc1());
-                IR::Opnd* savedFlags = instr->UnlinkSrc1();
+            // This is set up in IRBuilding
+            Assert(instr->GetSrc1());
+            IR::Opnd* savedFlags = instr->UnlinkSrc1();
 
-                m_lowererMD.LoadHelperArgument(instr, savedFlags);
-                m_lowererMD.LoadHelperArgument(instr, IR::Opnd::CreateUint32Opnd(instr->AsJitProfilingInstr()->loopNumber, m_func));
-                m_lowererMD.LoadHelperArgument(instr, IR::Opnd::CreateFramePointerOpnd(m_func));
-                instr->SetSrc1(IR::HelperCallOpnd::New(IR::HelperSimpleRecordLoopImplicitCallFlags, m_func));
-                m_lowererMD.LowerCall(instr, 0);
-            }
-            break;
+            m_lowererMD.LoadHelperArgument(instr, savedFlags);
+            m_lowererMD.LoadHelperArgument(instr, IR::Opnd::CreateUint32Opnd(instr->AsJitProfilingInstr()->loopNumber, m_func));
+            m_lowererMD.LoadHelperArgument(instr, IR::Opnd::CreateFramePointerOpnd(m_func));
+            instr->SetSrc1(IR::HelperCallOpnd::New(IR::HelperSimpleRecordLoopImplicitCallFlags, m_func));
+            m_lowererMD.LowerCall(instr, 0);
+        }
+        break;
 
         case Js::OpCode::InitLoopBodyCount:
             Assert(this->m_func->IsLoopBody());
@@ -2413,7 +2413,7 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
         {
             Js::ProfileId profileId;
             IR::Instr *profileBeforeInstr;
-            if(instr->IsJitProfilingInstr())
+            if (instr->IsJitProfilingInstr())
             {
                 profileId = instr->AsJitProfilingInstr()->profileId;
                 Assert(profileId != Js::Constants::NoProfileId);
@@ -2427,7 +2427,7 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
 
             this->LowerLdSlot(instr);
 
-            if(profileId != Js::Constants::NoProfileId)
+            if (profileId != Js::Constants::NoProfileId)
             {
                 LowerProfileLdSlot(instr->GetDst(), instr->m_func, profileId, profileBeforeInstr);
             }
@@ -2635,20 +2635,20 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
             // and eventually generate the nativeOffset maps.
 #if DBG_DUMP && DBG
             // If we have a JITStatementBreakpoint, then we should break on this statement
+        {
+            uint32 statementIndex = instr->AsPragmaInstr()->m_statementIndex;
+            if (Js::Configuration::Global.flags.StatementDebugBreak.Contains(instr->m_func->GetSourceContextId(), instr->m_func->GetLocalFunctionId(), statementIndex))
             {
-                uint32 statementIndex = instr->AsPragmaInstr()->m_statementIndex;
-                if (Js::Configuration::Global.flags.StatementDebugBreak.Contains(instr->m_func->GetSourceContextId(), instr->m_func->GetLocalFunctionId(), statementIndex))
+                IR::Instr* tempinstr = instr;
+                Assert(tempinstr != nullptr);
+                // go past any labels, and then add a debug breakpoint
+                while (tempinstr->m_next != nullptr && tempinstr->m_next->m_opcode == Js::OpCode::Label)
                 {
-                    IR::Instr* tempinstr = instr;
-                    Assert(tempinstr != nullptr);
-                    // go past any labels, and then add a debug breakpoint
-                    while (tempinstr->m_next != nullptr && tempinstr->m_next->m_opcode == Js::OpCode::Label)
-                    {
-                        tempinstr = tempinstr->m_next;
-                    }
-                    this->m_lowererMD.GenerateDebugBreak(tempinstr);
+                    tempinstr = tempinstr->m_next;
                 }
+                this->m_lowererMD.GenerateDebugBreak(tempinstr);
             }
+        }
 #endif
         break;
 
@@ -2697,7 +2697,7 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
             IR::Instr *bailOnNotArray = nullptr, *bailOnMissingValue = nullptr;
             SplitBailOnNotArray(instr, &bailOnNotArray, &bailOnMissingValue);
             IR::RegOpnd *const arrayOpnd = LowerBailOnNotArray(bailOnNotArray);
-            if(bailOnMissingValue)
+            if (bailOnMissingValue)
             {
                 LowerBailOnMissingValue(bailOnMissingValue, arrayOpnd);
             }
@@ -3023,6 +3023,43 @@ Lowerer::LowerRange(IR::Instr *instrStart, IR::Instr *instrEnd, bool defaultDoFa
             instr->Remove();
             break;
 
+        case Js::OpCode::SpeculatedLoadFence:
+        {
+            AssertOrFailFast(instr->m_kind == IR::InstrKindByteCodeUses);
+            IR::ByteCodeUsesInstr* bcuInstr = static_cast<IR::ByteCodeUsesInstr*>(instr);
+            // Most of the time we're not going to be able to remove any masking in a loop, and
+            // this instruction can be removed.
+#ifdef _M_ARM
+            AssertOrFailFastMsg(false, "We shouldn't perform this hoisting on ARM");
+#else
+            if (bcuInstr->GetByteCodeUpwardExposedUsed() != nullptr && !bcuInstr->GetByteCodeUpwardExposedUsed()->IsEmpty())
+            {
+                // The generated code is:
+                //
+                // cmp rax, rax
+                // for each symbol to mask:
+                // reg(sym) = cmovne reg(sym), reg(sym)
+                IR::RegOpnd* temp = IR::RegOpnd::New(TyUint8, instr->m_func);
+                InsertMove(temp, IR::IntConstOpnd::New(0, TyUint8, instr->m_func), instr);
+                IR::Instr * cmp = IR::Instr::New(Js::OpCode::CMP, instr->m_func);
+                cmp->SetSrc1(temp);
+                cmp->SetSrc2(temp);
+                instr->InsertBefore(cmp);
+                m_lowererMD.Legalize(cmp);
+                FOREACH_BITSET_IN_SPARSEBV(symid, bcuInstr->GetByteCodeUpwardExposedUsed())
+                {
+                    StackSym* thisSym = instr->m_func->m_symTable->Find(symid)->AsStackSym();
+                    IR::RegOpnd* thisSymReg = IR::RegOpnd::New(thisSym, thisSym->GetType(), instr->m_func);
+                    IR::Instr* cmov = IR::Instr::New(LowererMD::MDSpecBlockNEOpcode, thisSymReg, thisSymReg, thisSymReg, instr->m_func);
+                    instr->InsertBefore(cmov);
+                    m_lowererMD.Legalize(cmov);
+                } NEXT_BITSET_IN_SPARSEBV;
+            }
+#endif
+            instr->Remove();
+            break;
+        }
+
 #endif //ENABLE_WASM
 
         default:
@@ -16897,7 +16934,7 @@ Lowerer::GenerateFastElemIIntIndexCommon(
     // Should we poison the load of the address to/from which the store/load happens?
     bool shouldPoisonLoad = maskOpnd != nullptr
         && (
-            (!isStore &&
+            (!isStore && (!instr->IsSafeToSpeculate()) &&
                 (baseValueType.IsLikelyTypedArray()
                     ? CONFIG_FLAG_RELEASE(PoisonTypedArrayLoad)
                     : ((indirType == TyVar && CONFIG_FLAG_RELEASE(PoisonVarArrayLoad))

+ 1 - 0
lib/Backend/LowerMDShared.cpp

@@ -21,6 +21,7 @@ const Js::OpCode LowererMD::MDConvertFloat64ToFloat32Opcode = Js::OpCode::CVTSD2
 const Js::OpCode LowererMD::MDCallOpcode = Js::OpCode::CALL;
 const Js::OpCode LowererMD::MDImulOpcode = Js::OpCode::IMUL2;
 const Js::OpCode LowererMD::MDLea = Js::OpCode::LEA;
+const Js::OpCode LowererMD::MDSpecBlockNEOpcode = Js::OpCode::CMOVNE;
 
 static const int TWO_31_FLOAT = 0x4f000000;
 static const int FLOAT_INT_MIN = 0xcf000000;

+ 1 - 0
lib/Backend/LowerMDShared.h

@@ -80,6 +80,7 @@ public:
     static const Js::OpCode MDCallOpcode;
     static const Js::OpCode MDImulOpcode;
     static const Js::OpCode MDLea;
+    static const Js::OpCode MDSpecBlockNEOpcode;
 
     UINT FloatPrefThreshold;
 

+ 1 - 0
lib/Backend/Sym.h

@@ -28,6 +28,7 @@ enum SymKind : BYTE
 };
 
 typedef uint32 SymID;
+constexpr SymID SymID_Invalid = (SymID)-1;
 
 
 ///---------------------------------------------------------------------------

+ 1 - 1
lib/Backend/TempTracker.cpp

@@ -818,7 +818,7 @@ NumberTemp::GetRepresentativePropertySymId(PropertySym * propertySym, BackwardPa
 {
     // Since we don't track alias with objects, all property accesses are all grouped together.
     // Use a single property sym id to represent a propertyId to track dependencies.
-    SymID symId = (SymID)-1;
+    SymID symId = SymID_Invalid;
     Js::PropertyId propertyId = propertySym->m_propertyId;
     if (!backwardPass->numberTempRepresentativePropertySym->TryGetValue(propertyId, &symId))
     {

+ 1 - 0
lib/Backend/arm64/LowerMD.cpp

@@ -18,6 +18,7 @@ const Js::OpCode LowererMD::MDConvertFloat64ToFloat32Opcode = Js::OpCode::FCVT;
 const Js::OpCode LowererMD::MDCallOpcode = Js::OpCode::Call;
 const Js::OpCode LowererMD::MDImulOpcode = Js::OpCode::MUL;
 const Js::OpCode LowererMD::MDLea = Js::OpCode::LEA;
+const Js::OpCode LowererMD::MDSpecBlockNEOpcode = Js::OpCode::CSELNE;
 
 template<typename T>
 inline void Swap(T& x, T& y)

+ 1 - 0
lib/Backend/arm64/LowerMD.h

@@ -61,6 +61,7 @@ public:
     static const Js::OpCode MDCallOpcode;
     static const Js::OpCode MDImulOpcode;
     static const Js::OpCode MDLea;
+    static const Js::OpCode MDSpecBlockNEOpcode;
 
 public:
             void            Init(Lowerer *lowerer);

+ 1 - 0
lib/Common/CommonMin.h

@@ -55,6 +55,7 @@ using namespace Memory;
 #include "DataStructures/KeyValuePair.h"
 #include "DataStructures/BaseDictionary.h"
 #include "DataStructures/DictionaryEntry.h"
+#include "DataStructures/ClusterList.h"
 
 // === Configurations Header ===
 #include "Core/ConfigFlagsTable.h"

+ 1 - 0
lib/Common/ConfigFlagsList.h

@@ -210,6 +210,7 @@ PHASE(All)
                     PHASE(MarkTempNumber)
                     PHASE(MarkTempObject)
                     PHASE(MarkTempNumberOnTempObject)
+                PHASE(SpeculationPropagationAnalysis)
         PHASE(DumpGlobOptInstr) // Print the Globopt instr string in post lower dumps
         PHASE(Lowerer)
             PHASE(FastPath)

+ 1 - 0
lib/Common/DataStructures/Chakra.Common.DataStructures.vcxproj

@@ -52,6 +52,7 @@
     <ClInclude Include="BufferBuilder.h" />
     <ClInclude Include="Cache.h" />
     <ClInclude Include="CharacterBuffer.h" />
+    <ClInclude Include="ClusterList.h" />
     <ClInclude Include="CommonDataStructuresPch.h" />
     <ClInclude Include="ContinuousPageStack.h" />
     <ClInclude Include="DefaultContainerLockPolicy.h" />

+ 420 - 0
lib/Common/DataStructures/ClusterList.h

@@ -0,0 +1,420 @@
+//-------------------------------------------------------------------------------------------------------
+// Copyright (C) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
+//-------------------------------------------------------------------------------------------------------
+#pragma once
+
+#include "../Core/Assertions.h"
+#include "../Memory/Allocator.h"
+
+/*
+ * ClusterLists are intended to solve the problem of determining what connected set
+ * groups are present in a graph while iterating over a list of connections.
+ */
+
+template<typename indexType, class TAllocator>
+class ClusterList
+{
+private:
+    // The maximum index enabled; basically an array bounds for the set.
+    indexType maxIndex;
+    // The low-level data store backing the structure. This is a list initialized as the
+    // identity mapping (list[i] = i), and is then updated as links are added so that an
+    // entry in the list is the index of an entry of equal or lower number which is part
+    // of the same set. This is later updated to be the index of the element with lowest
+    // index in the set for all entries, avoiding situations where lookups would need to
+    // follow a chain.
+    indexType* list;
+#if DBG
+    // During the merge process, we may end up with some situations where set membership
+    // is chained; that is, that node 7 is in set 5, but 5 is in set 4. Following all of
+    // the merge operations we have a step to reconcile all of this information and then
+    // update each element of the list to point to the least member of its set. This can
+    // be checked in debug builds to see if we've done this step or not.
+    bool consolidated = false;
+#endif
+    TAllocator* alloc;
+public:
+    // Create a new ClusterList with a specified maximum node number
+    ClusterList(TAllocator* alloc, indexType maxCount)
+        : maxIndex(maxCount)
+        , list(nullptr)
+        , alloc(alloc)
+#if DBG
+        , consolidated(true)
+#endif
+    {
+        list = AllocatorNewArrayLeaf(TAllocator, this->alloc, indexType, maxCount);
+        for (indexType i = 0; i < maxIndex; i++)
+        {
+            list[i] = i;
+        }
+    }
+    // Merge the set containing node a and the set containing node b
+    void Merge(indexType a, indexType b)
+    {
+        indexType aVal = GetSet<false>(a);
+        indexType bVal = GetSet<false>(b);
+        if (aVal == bVal)
+            return;
+        indexType min = (aVal < bVal ? aVal : bVal);
+        list[aVal] = min;
+        list[bVal] = min;
+#if DBG
+        consolidated = false;
+#endif
+    }
+    ~ClusterList()
+    {
+        if (this->list != nullptr)
+        {
+            // Note: the array length must match the constructor's allocation; maxIndex
+            // holds the maxCount value that the array was allocated with.
+            AllocatorDeleteArrayLeaf(TAllocator, this->alloc, maxIndex, this->list);
+            this->list = nullptr;
+        }
+    }
+    // Do a pass to update each set membership reference to the minimum for the set
+    void Consolidate()
+    {
+        for (indexType i = 0; i < maxIndex; i++)
+        {
+            list[i] = list[list[i]];
+        }
+#if DBG
+        consolidated = true;
+#endif
+    }
+    // Reset the list; useful if we're re-using the data structure
+    void Reset()
+    {
+        for (indexType i = 0; i < maxIndex; i++)
+        {
+            list[i] = i;
+        }
+#if DBG
+        consolidated = true;
+#endif
+    }
+    // Get the index of the least element in the set, which serves as a unique set identifier
+    // (note that if further merges happen, you may end up in a set with a different number)
+    template<bool assumeConsolidated>
+    indexType GetSet(indexType in)
+    {
+        if (assumeConsolidated)
+        {
+            Assert(consolidated);
+            return list[in];
+        }
+        else
+        {
+            if (list[in] == in)
+            {
+                return in;
+            }
+            else
+            {
+                indexType actualSet = GetSet<false>(list[in]);
+                list[in] = actualSet;
+                return actualSet;
+            }
+        }
+    }
+    // Map a function on index, setnumber for all nodes in the set
+    // note that it'll run on nodes that were never involved in any merges and are thus in solo sets
+    template<typename MapAccessoryType>
+    inline void Map(void(*callBack)(indexType, indexType, MapAccessoryType), MapAccessoryType accessory)
+    {
+        Assert(consolidated);
+        for (indexType i = 0; i < maxIndex; i++)
+        {
+            callBack(i, list[i], accessory);
+        }
+    }
+};
+
+/*
+ * SegmentClusterList differs from a normal ClusterList in that we allocate regions
+ * of the list instead of the entire list. This should provide good performance due
+ * to the multi-pass nature of the globopt; regions of code will generally generate
+ * stacksyms in one region per pass (since they're allocated incrementally), so the
+ * numbers used will normally be somewhat grouped. This saves us allocating most of
+ * the array in cases where we have large functions (which is more common than code
+ * input may indicate due to inlining), assuming that the data structure would need
+ * to be allocated on a per-block basis otherwise.
+ */
+
+template<typename indexType, class TAllocator, unsigned int numPerSegment = 16>
+class SegmentClusterList
+{
+private:
+    // The maximum index enabled; basically an array bounds for the set.
+    indexType maxIndex;
+    // The backing store for this ClusterList type is an array of pointers to subsegment
+    // structures, each of which stores a fixed region. Elements outside this region are
+    // assumed to have an identity mapping before first use.
+    indexType **backingStore;
+#if DBG
+    // During the merge process, we may end up with some situations where set membership
+    // is chained; that is, that node 7 is in set 5, but 5 is in set 4. Following all of
+    // the merge operations we have a step to reconcile all of this information and then
+    // update each element of the list to point to the least member of its set. This can
+    // be checked in debug builds to see if we've done this step or not.
+    bool consolidated = false;
+#endif
+    TAllocator* alloc;
+    inline void EnsureBaseSize(indexType inputMax)
+    {
+        // We want to ensure that we can store to inputMax, which means we need to grow to inputMax+1
+        indexType targetSize = inputMax + 1;
+        // round up to the next numPerSegment multiple
+        indexType max = (targetSize % numPerSegment == 0) ? targetSize : (targetSize + numPerSegment - (targetSize % numPerSegment));
+        // get segment counts for allocation
+        indexType newNumSegments = max / numPerSegment;
+        indexType curNumSegments = maxIndex / numPerSegment;
+        if (newNumSegments > curNumSegments)
+        {
+            // Note that if you want to use this with something other than an ArenaAllocator, then
+            // you'll have to add clean-up mechanics
+            indexType **tempList = AllocatorNewArray(TAllocator, alloc, indexType*, newNumSegments);
+            for (indexType i = 0; i < curNumSegments; i++)
+            {
+                tempList[i] = backingStore[i];
+            }
+            for (indexType i = curNumSegments; i < newNumSegments; i++)
+            {
+                tempList[i] = nullptr;
+            }
+            // Set the max index to the number of supported indices
+            maxIndex = max;
+            // replace the backingStore with the new backing store
+            indexType **oldBackingStore = backingStore;
+            backingStore = tempList;
+            AllocatorDeleteArray(TAllocator, alloc, curNumSegments, oldBackingStore);
+        }
+    }
+    inline void CreateBacking(indexType index)
+    {
+        Assert(index >= maxIndex || backingStore[index / numPerSegment] == nullptr);
+        // grow the size of the pointer array if needed
+        EnsureBaseSize(index);
+        if (backingStore[index / numPerSegment] == nullptr)
+        {
+            // allocate a new segment
+            backingStore[index / numPerSegment] = AllocatorNewArrayLeaf(TAllocator, alloc, indexType, numPerSegment);
+            indexType baseForSegment = (index / numPerSegment) * numPerSegment;
+            for (indexType i = 0; i < numPerSegment; i++)
+            {
+                backingStore[index / numPerSegment][i] = i + baseForSegment;
+            }
+        }
+    }
+    template<bool createBacking = false, bool assumeConsolidated = false>
+    inline indexType Lookup(indexType index)
+    {
+        if (index >= maxIndex || backingStore[index / numPerSegment] == nullptr)
+        {
+            // Not being there simply means that it's still an identity mapping
+            if(createBacking)
+            {
+                CreateBacking(index);
+            }
+            return index;
+        }
+        if (assumeConsolidated)
+        {
+            // If the list is consolidated, then we can return whatever's there
+            return backingStore[index / numPerSegment][index % numPerSegment];
+        }
+        else
+        {
+            // If it's not consolidated, we need to follow the chain down, and update each entry to the root
+            indexType currentValue = backingStore[index / numPerSegment][index % numPerSegment];
+            if (currentValue == index)
+            {
+                return index;
+            }
+            indexType trueRoot = Lookup<false, assumeConsolidated>(currentValue);
+            backingStore[index / numPerSegment][index % numPerSegment] = trueRoot;
+            return trueRoot;
+        }
+    }
+
+    template<bool assumeExists = false>
+    inline void Assign(indexType index, indexType value)
+    {
+        if (!assumeExists)
+        {
+            if (index >= maxIndex || backingStore[index / numPerSegment] == nullptr)
+            {
+                CreateBacking(index);
+            }
+        }
+        backingStore[index / numPerSegment][index % numPerSegment] = value;
+    }
+public:
+    // Create a new ClusterList with a specified maximum node number
+    SegmentClusterList(TAllocator* allocator, indexType maxCount)
+        : maxIndex(maxCount % numPerSegment == 0 ? maxCount : maxCount + (numPerSegment - (maxCount % numPerSegment)))
+        , backingStore(nullptr)
+        , alloc(allocator)
+#if DBG
+        , consolidated(true)
+#endif
+    {
+        backingStore = AllocatorNewArrayZ(TAllocator, alloc, indexType*, maxIndex / numPerSegment);
+    }
+    ~SegmentClusterList()
+    {
+        if (this->backingStore != nullptr)
+        {
+            // Reset is just a delete + nullptr on all segments
+            this->Reset();
+            AllocatorDeleteArray(TAllocator, alloc, maxIndex / numPerSegment, this->backingStore);
+            this->backingStore = nullptr;
+        }
+    }
+    // Merge the set containing node a and the set containing node b
+    void Merge(indexType a, indexType b)
+    {
+        if (a == b)
+            return;
+        indexType aVal = Lookup<true>(a);
+        indexType bVal = Lookup<true>(b);
+        indexType min = (aVal < bVal ? aVal : bVal);
+        // We need to update the roots of both branches to point to the min
+        Assign<false>(aVal, min);
+        Assign<false>(bVal, min);
+#if DBG
+        if(aVal != min || bVal != min)
+            consolidated = false;
+#endif
+    }
+    // Do a pass to update each set membership reference to the minimum for the set
+    void Consolidate()
+    {
+        for (indexType i = 0; i < maxIndex / numPerSegment; i++)
+        {
+            if (backingStore[i] != nullptr)
+            {
+                for (indexType j = 0; j < numPerSegment; j++)
+                {
+                    // We can assumeConsolidated here because everything less than the current index is consolidated
+                    backingStore[i][j] = Lookup<false, true>(backingStore[i][j]);
+                }
+            }
+        }
+#if DBG
+        consolidated = true;
+#endif
+    }
+    // Reset the list; useful if we're re-using the data structure
+    void Reset()
+    {
+        for (indexType i = 0; i < maxIndex / numPerSegment; i++)
+        {
+            if (backingStore[i] != nullptr)
+            {
+                AllocatorDeleteArrayLeaf(TAllocator, alloc, numPerSegment, backingStore[i]);
+                backingStore[i] = nullptr;
+            }
+        }
+#if DBG
+        consolidated = true;
+#endif
+    }
+    // Get the index of the least element in the set, which serves as a unique set identifier
+    // (note that if further merges happen, you may end up in a set with a different number)
+    indexType GetSet(indexType in)
+    {
+        Assert(consolidated);
+        return Lookup<false>(in);
+    }
+    // Map a function on index, setnumber, accessory for all nodes in the set (or just non-identities)
+    // note that it'll run on nodes that were never involved in any merges and are thus in solo sets
+    template<typename MapAccessoryType, bool onlyNonIdentity = false>
+    inline void Map(void(*callBack)(indexType, indexType, MapAccessoryType), MapAccessoryType accessory)
+    {
+        Assert(consolidated);
+        if (onlyNonIdentity)
+        {
+            for (indexType i = 0; i < maxIndex / numPerSegment; i++)
+            {
+                if (backingStore[i] == nullptr)
+                {
+                    continue;
+                }
+                for (indexType j = 0; j < numPerSegment; j++)
+                {
+                    indexType index = i * numPerSegment + j;
+                    indexType local = Lookup<false, true>(index);
+                    if (index != local)
+                    {
+                        callBack(index, local, accessory);
+                    }
+                }
+            }
+        }
+        else
+        {
+            for (indexType i = 0; i < maxIndex; i++)
+            {
+                callBack(i, Lookup<false, true>(i), accessory);
+            }
+        }
+    }
+
+    // Map a function across the set for a particular index
+    template<typename MapAccessoryType, bool onlyNonIdentity = false>
+    inline void MapSet(indexType baseSetMember, void(*callBack)(indexType, MapAccessoryType), MapAccessoryType accessory)
+    {
+        Assert(consolidated);
+        indexType baseSet = Lookup<false, true>(baseSetMember);
+        if (!onlyNonIdentity)
+        {
+            callBack(baseSet, accessory);
+        }
+        // We now only need to look at the stuff greater than baseSet
+        for (indexType i = baseSet + 1; i < maxIndex; i++)
+        {
+            if (backingStore[i / numPerSegment] == nullptr)
+            {
+                // advance to the next block if this one is an identity block
+                i += (numPerSegment - (i % numPerSegment)) - 1;
+                continue;
+            }
+            if (backingStore[i / numPerSegment][i % numPerSegment] == baseSet)
+            {
+                callBack(i, accessory);
+            }
+        }
+    }
+
+#if DBG_DUMP
+    void Dump()
+    {
+        bool printed = false;
+        Output::Print(_u("["));
+        for (indexType i = 0; i < maxIndex / numPerSegment; i++)
+        {
+            if (backingStore[i] == nullptr)
+            {
+                continue;
+            }
+            for (indexType j = 0; j < numPerSegment; j++)
+            {
+                indexType index = i * numPerSegment + j;
+                indexType local = Lookup<false, true>(index);
+                if (index != local)
+                {
+                    if (printed)
+                    {
+                        Output::Print(_u(", "));
+                    }
+                    Output::Print(_u("%u <= %u"), local, index);
+                    printed = true;
+                }
+            }
+        }
+        Output::Print(_u("]\n"));
+    }
+#endif
+};

+ 6 - 6
lib/Common/DataStructures/SparseBitVector.h

@@ -14,7 +14,7 @@ typedef  BVUnit64 SparseBVUnit;
         BVIndex _offset; \
         BVIndex _startIndex = _curNode->startIndex; \
         SparseBVUnit  _unit = _curNode->data; \
-        for(_offset = _unit.GetNextBit(); _offset != -1; _offset = _unit.GetNextBit()) \
+        for(_offset = _unit.GetNextBit(); _offset != BVInvalidIndex; _offset = _unit.GetNextBit()) \
         { \
             index = _startIndex + _offset; \
             _unit.Clear(_offset); \
@@ -43,7 +43,7 @@ typedef  BVUnit64 SparseBVUnit;
         BVIndex _offset; \
         BVIndex _startIndex = _curNodeEdit->startIndex; \
         SparseBVUnit  _unit = _curNodeEdit->data; \
-        for(_offset = _unit.GetNextBit(); _offset != -1; _offset = _unit.GetNextBit()) \
+        for(_offset = _unit.GetNextBit(); _offset != BVInvalidIndex; _offset = _unit.GetNextBit()) \
         { \
             index = _startIndex + _offset; \
             _unit.Clear(_offset); \
@@ -530,12 +530,12 @@ BVSparse<TAllocator>::GetNextBit(BVSparseNode *node) const
     while(0 != node)
     {
         BVIndex ret = node->data.GetNextBit();
-        if(-1 != ret)
+        if(BVInvalidIndex != ret)
         {
             return ret + node->startIndex;
         }
     }
-    return -1;
+    return BVInvalidIndex;
 }
 
 template <class TAllocator>
@@ -549,7 +549,7 @@ BVSparse<TAllocator>::GetNextBit(BVIndex i) const
         if(startIndex == node->startIndex)
         {
             BVIndex ret = node->data.GetNextBit(SparseBVUnit::Offset(i));
-            if(-1 != ret)
+            if(BVInvalidIndex != ret)
             {
                 return ret + node->startIndex;
             }
@@ -564,7 +564,7 @@ BVSparse<TAllocator>::GetNextBit(BVIndex i) const
         }
     }
 
-    return  -1;
+    return BVInvalidIndex;
 }
 
 template <class TAllocator>

+ 2 - 0
lib/Runtime/ByteCode/OpCodes.h

@@ -797,6 +797,8 @@ MACRO_BACKEND_ONLY(     LdSpreadIndices,    Empty,          None)
 
 MACRO_EXTEND_WMS(       ClearAttributes,    ElementU,       None)
 
+MACRO_BACKEND_ONLY(     SpeculatedLoadFence,Reg1,           None)
+
 MACRO_EXTEND_WMS(       LdHomeObj,          Reg1,           OpSideEffect)
 MACRO_EXTEND_WMS(       LdFuncObj,          Reg1,           OpSideEffect)
 MACRO_EXTEND_WMS(       LdHomeObjProto,     Reg2,           OpSideEffect)