//------------------------------------------------------------------------------------------------------- // Copyright (C) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information. //------------------------------------------------------------------------------------------------------- #pragma once //----------------------------------------------------------------------------- enum EditKind { // No change. None = 0, // Node value was updated. Update, // Node was inserted. Insert, // Node was deleted. Delete, // Node changed parent. Move, // Node changed position within its parent. The parent nodes of the old node and the new node are matching. Reorder, }; //----------------------------------------------------------------------------- // Calculates Longest Common Subsequence. // This uses the basic version in // // EUGENE W. MYERS: An O(ND) Difference Algorithm and Its Variations // // The idea is that LCS is a dual problem of shortest path in an edit graph. The edit graph is a grid of lengthA // columns and lengthB rows. A path starts from (0,0) and moves toward (lengthA, lengthB). // - A horizontal move (i,j) -> (i+1,j) represents deleting A[i]. // - A vertical move (i,j) -> (i,j+1) represents inserting B[j]. // - A diagonal move (i,j) -> (i+1,j+1) represents a match, A[i] == B[j]. // Each diagonal move represents a match. We want more diagonal moves. Let diagonal move cost 0, horizontal or // vertical move each costs 1. The basic algorithm is a greedy algorithm to find a shortest path from (0,0) to // (lengthA, lengthB). // // Terms: // diagonal k: The diagonal where x-y==k. // d-path: A path starting from (0,0) with d number of horizontal or vertical moves. Or, its length is d (note // that each horizontal/vertical move costs 1, diagonal move costs 0). // // 0-path can only move along and end on diagonal 0. // 1-path can only end on diagonal -1 or 1. // d-path can end on diagonal [-d, -d+2, ..., d-2, d]. // // The basic algorithm tries to find the smallest d, where there is a d-path reaches (lengthA, lengthB). //----------------------------------------------------------------------------- template class LongestCommonSubsequence { private: // Stores d-path furthest reaching endpoints. They can be on diagonal [-d, -d+2, ..., d-2, d]. class EndPoints { private: int d; int x[]; // Stores x for endpoints on the (d+1) diagonals. y == x - k. EndPoints(int d) : d(d) { } public: int getd() const { return d; } // Get x of furthest reaching endpoint on diagonal k. int operator[](int k) const { Assert(k >= -d && k <= d && (d - k) % 2 == 0); // k must be in [-d, -d+2, ..., d-2, d] int i = (k + d) / 2; return x[i]; } // Get x reference of furthest reaching endpoint on diagonal k. int& operator[](int k) { Assert(k >= -d && k <= d && (d - k) % 2 == 0); // k must be in [-d, -d+2, ..., d-2, d] int i = (k + d) / 2; return x[i]; } static EndPoints* New(Allocator* alloc, int d) { Assert(d >= 0); return AllocatorNewPlusLeaf(Allocator, alloc, sizeof(int) * (d + 1), EndPoints, d); } void Destroy(Allocator* alloc) { AllocatorDeletePlusLeaf(Allocator, alloc, sizeof(int) * (d + 1), this); } }; // Represents an EditGraph for finding LCS class EditGraph { private: typedef JsUtil::List EndPointsList; EndPointsList m_endPoints; // Stores endPoints for paths: -1, 0, 1, ..., d int m_diagonal; // The final diagonal found on d-path that reaches destination // Add EndPoints storage for d-path EndPoints* AddEndPoints(int d) { int i = m_endPoints.Add(nullptr); EndPoints* e = EndPoints::New(m_endPoints.GetAllocator(), d); m_endPoints.Item(i, e); return e; } public: EditGraph(Allocator* alloc) : m_endPoints(alloc) {} ~EditGraph() { Allocator* alloc = m_endPoints.GetAllocator(); m_endPoints.Map([=](int, EndPoints* e) { if (e) { e->Destroy(alloc); } }); } // // This is the basic algorithm to find a shortest path in the edit graph from (0,0) to (lengthA, lengthB). // We iterate through d=0,1,2,... to find smallest d, where one d-path reaches (lengthA, lengthB). // - d-path must end on diagonal [-d, -d+2, ..., d-2, d] // - A furthest reaching d-path on diagonal k is composed of a furthest reaching d-1 path on diagonal k-1 or k+1, // followed by a vertical or horizontal move, followed by moving along diagonal k. // template void FindPath(int lengthA, int lengthB, const ItemEquals& equals) { Assert(m_endPoints.Empty()); // Only support one FindPath int maxD; if (Int32Math::Add(lengthA, lengthB, &maxD) || maxD > INT_MAX / 2) // Limits maxD to simplify overflow handling { Math::DefaultOverflowPolicy(); } // Pre-add virtual path -1 { EndPoints& pre = *AddEndPoints(1); pre[1] = 0; } bool found = false; for (int d = 0; d <= maxD && !found; d++) { const EndPoints& v = *m_endPoints.Item(d); // d-1 path EndPoints& cur = *AddEndPoints(d); // d path for (int k = -d; k <= d; k += 2) { const bool verticalMove = (k == -d || (k != d && v[k - 1] < v[k + 1])); int x = verticalMove ? v[k + 1] : v[k - 1] + 1; int y = x - k; while (x < lengthA && y < lengthB && equals(x, y)) { x++; y++; } cur[k] = x; // furthest reaching end point if (x == lengthA && y == lengthB) { m_diagonal = k; found = true; break; } } } Assert(found); } template void MapEdits(const Func& map) const { // m_endPoints contains endPoints for paths: -1, 0, 1, ..., d int d = m_endPoints.Count() - 2; int k = m_diagonal; for (; d >= 0; d--) { const EndPoints& v = *m_endPoints.Item(d); // d-1 path const EndPoints& cur = *m_endPoints.Item(d + 1); // d path Assert(cur.getd() == d); const bool verticalMove = (k == -d || (k != d && v[k - 1] < v[k + 1])); int x0 = verticalMove ? v[k + 1] : v[k - 1] + 1; int x = cur[k]; int y = x - k; while (x > x0) { map(EditKind::Update, --x, --y); } if (verticalMove) { if (d > 0) // Don't emit virtual initial move from path -1 to 0 { map(EditKind::Insert, -1, --y); } k++; } else { map(EditKind::Delete, --x, -1); k--; } } } }; struct Edit { EditKind kind; int indexA; int indexB; Edit() {} Edit(EditKind kind, int indexA, int indexB) : kind(kind), indexA(indexA), indexB(indexB) { Assert((kind == EditKind::Insert && indexA == -1 && indexB >= 0) || (kind == EditKind::Delete && indexA >= 0 && indexB == -1) || (kind == EditKind::Update && indexA >= 0 && indexB >= 0)); } }; typedef JsUtil::List EditList; EditList m_edits; public: template LongestCommonSubsequence(Allocator* alloc, int lengthA, int lengthB, const ItemEquals& equals) : m_edits(alloc) { EditGraph graph(alloc); graph.FindPath(lengthA, lengthB, equals); graph.MapEdits([this](EditKind kind, int indexA, int indexB) { m_edits.Add(Edit(kind, indexA, indexB)); }); } template void MapEdits(const Func& map) const { for (int i = m_edits.Count() - 1; i >= 0; i--) { const Edit& e = m_edits.Item(i); map(e.kind, e.indexA, e.indexB); } } template void MapMatches(const Func& map) const { MapEdits([&](EditKind kind, int indexA, int indexB) { if (kind == EditKind::Update) { map(indexA, indexB); } }); } }; // // Returns a distance [0..1] of the specified sequences. The smaller distance the more of their elements match. // template double ComputeLongestCommonSubsequenceDistance(Allocator* alloc, int lengthA, int lengthB, const ItemEquals& equals) { Assert(lengthA >= 0 && lengthB >= 0); if (lengthA == 0 || lengthB == 0) { return (lengthA == lengthB) ? 0.0 : 1.0; } int lcsLength = 0; LongestCommonSubsequence lcs(alloc, lengthA, lengthB, equals); lcs.MapMatches([&](int, int) { ++lcsLength; }); return 1.0 - (double)lcsLength / (double)max(lengthA, lengthB); } //----------------------------------------------------------------------------- // Base class for TreeComparers, used with TreeMatch. TreeComparers specify parse node details. //----------------------------------------------------------------------------- template struct TreeComparerBase { typedef Node Node; typedef Node* PNode; static const double ExactMatchDistance; static const double EpsilonDistance; const SubClass* pThis() const { return static_cast(this); } SubClass* pThis() { return static_cast(this); } // The number of distinct labels used in the tree. int LabelCount() const { return 0; } // Returns an integer label corresponding to the given node. // Returned value must be within [0, LabelCount). int GetLabel(PNode x) const { return 0; } // Returns N > 0 if the node with specified label can't change its N-th ancestor node, zero otherwise. // 1st ancestor is the node's parent node. // 2nd ancestor is the node's grandparent node. // etc. int TiedToAncestor(int label) { return 0; } // Calculates the distance [0..1] of two nodes. // The more similar the nodes the smaller the distance. // // Used to determine whether two nodes of the same label match. // Even if 0 is returned the nodes might be slightly different. double GetDistance(PNode x, PNode y) const { return 0; } // Returns true if the specified nodes have equal values. // Called with matching nodes (oldNode, newNode). // Return true if the values of the nodes are the same, or their difference is not important. bool ValuesEqual(PNode oldNode, PNode newNode) const { return true; } PNode GetParent(PNode x) const { return nullptr; } bool TryGetParent(PNode x, _Out_ PNode* p) const { *p = pThis()->GetParent(x); return *p != nullptr; } PNode GetAncestor(PNode node, int level) const { while (level > 0) { node = pThis()->GetParent(node); level--; } return node; } // Map children nodes of x template void MapChildren(PNode x, const Func& func) const {} // Map all descendant nodes of x (not including x itself) template void MapDescendants(PNode x, const Func& func) const { pThis()->MapChildren(x, [&](PNode child) { func(child); MapDescendants(child, func); }); } // Map every node in the (sub)tree x. template void MapTree(PNode x, const Func& func) const { func(x); pThis()->MapDescendants(x, func); } // Return true if specified nodes belong to the same tree. For debug only. bool TreesEqual(PNode left, PNode right) const { return true; } }; template const double TreeComparerBase::ExactMatchDistance = 0.0; template const double TreeComparerBase::EpsilonDistance = 0.00001; //----------------------------------------------------------------------------- // Tree match algorithm, based on general algorithm described in // Change Detection in Hierarchically Structured Information // by Sudarshan S. Chawathe, Anand Rajaraman, Hector Garcia-Molina, and Jennifer Widom // // Derived from Roslyn implementation. //----------------------------------------------------------------------------- template class TreeMatch { public: // ParseNodes are owned by Parser arena. Considered leaf here. typedef typename TreeComparer::PNode PNode; typedef JsUtil::List NodeList; typedef JsUtil::BaseDictionary::AllocatorType> NodeMap; private: static const double ExactMatchDistance; static const double EpsilonDistance; static const double MatchingDistance1; static const double MatchingDistance2; static const double MatchingDistance3; static const double MaxDistance; Allocator* alloc; const PNode root1; const PNode root2; TreeComparer comparer; NodeMap* oneToTwo; NodeMap* twoToOne; public: TreeMatch(Allocator* alloc, PNode root1, PNode root2, const TreeComparer& comparer = TreeComparer()) : alloc(alloc), root1(root1), root2(root2), comparer(comparer) { const int labelCount = comparer.LabelCount(); // calculate chains (not including root node) AutoAllocatorObjectArrayPtr nodes1(AllocatorNewArrayZ(Allocator, alloc, NodeList*, labelCount), labelCount, alloc); AutoAllocatorObjectArrayPtr nodes2(AllocatorNewArrayZ(Allocator, alloc, NodeList*, labelCount), labelCount, alloc); int count1 = CategorizeNodesByLabels(root1, labelCount, nodes1); int count2 = CategorizeNodesByLabels(root2, labelCount, nodes2); AutoAllocatorObjectPtr map1(AllocatorNew(Allocator, alloc, NodeMap, alloc, count1), alloc); AutoAllocatorObjectPtr map2(AllocatorNew(Allocator, alloc, NodeMap, alloc, count2), alloc); this->oneToTwo = map1; this->twoToOne = map2; ComputeMatch(nodes1, nodes2, labelCount); // Succeeded. Detach local objects that are now owned by this instance. map1.Detach(); map2.Detach(); } ~TreeMatch() { DeleteObject(alloc, oneToTwo); DeleteObject(alloc, twoToOne); } const TreeComparer& Comparer() const { return comparer; } PNode OldRoot() const { return root1; } PNode NewRoot() const { return root2; } bool HasPartnerInTree1(PNode node2) const { Assert(comparer.TreesEqual(node2, root2)); return twoToOne->ContainsKey(node2); } bool HasPartnerInTree2(PNode node1) const { Assert(comparer.TreesEqual(node1, root1)); return oneToTwo->ContainsKey(node1); } bool TryGetPartnerInTree1(PNode node2, PNode* partner1) const { Assert(comparer.TreesEqual(node2, root2)); return twoToOne->TryGetValue(node2, partner1); } bool TryGetPartnerInTree2(PNode node1, PNode* partner2) const { Assert(comparer.TreesEqual(node1, root1)); return oneToTwo->TryGetValue(node1, partner2); } bool Contains(PNode node1, PNode node2) const { Assert(comparer.TreesEqual(node2, root2)); PNode partner2; return TryGetPartnerInTree2(node1, &partner2) && node2 == partner2; } private: int CategorizeNodesByLabels(PNode root, int labelCount, _Out_writes_(labelCount) NodeList* nodes[]) { int count = 0; comparer.MapDescendants(root, [&](PNode node) { int label = comparer.GetLabel(node); Assert(label >= 0 && label < labelCount); NodeList* list = nodes[label]; if (!list) { list = NodeList::New(alloc); nodes[label] = list; } list->Add(node); count++; }); return count; } void ComputeMatch(_In_reads_(labelCount) NodeList* nodes1[], _In_reads_(labelCount) NodeList* nodes2[], int labelCount) { // Root nodes always match but they might have been added as knownMatches if (!HasPartnerInTree2(root1)) { Add(root1, root2); } // --- The original FastMatch algorithm --- // // For each leaf label l, and then for each internal node label l do: // a) S1 := chain T1(l) // b) S2 := chain T2(l) // c) lcs := LCS(S1, S2, Equal) // d) For each pair of nodes (x,y) in lcs add (x,y) to M. // e) Pair unmatched nodes with label l as in Algorithm Match, adding matches to M: // For each unmatched node x in T1, if there is an unmatched node y in T2 such that equal(x,y) // then add (x,y) to M. // // equal(x,y) is defined as follows: // x, y are leafs => equal(x,y) := label(x) == label(y) && compare(value(x), value(y)) <= f // x, y are nodes => equal(x,y) := label(x) == label(y) && |common(x,y)| / max(|x|, |y|) > t // where f, t are constants. // // --- Actual implementation --- // // We also categorize nodes by their labels, but then we proceed differently: // // 1) A label may be marked "tied to parent". Let x, y have both label l and l is "tied to parent". // Then (x,y) can be in M only if (parent(x), parent(y)) in M. // Thus we require labels of children tied to a parent to be preceded by all their possible parent labels. // // 2) Rather than defining function equal in terms of constants f and t, which are hard to get right, // we try to match multiple times with different thresholds for node distance. // The comparer defines the distance [0..1] between two nodes and it can do so by analyzing // the node structure and value. The comparer can tune the distance specifically for each node kind. // We first try to match nodes of the same labels to the exactly matching or almost matching counterparts. // Then we keep increasing the threshold and keep adding matches. for (int label = 0; label < labelCount; label++) { if (nodes1[label] && nodes2[label]) { ComputeMatchForLabel(label, *nodes1[label], *nodes2[label]); } } } void ComputeMatchForLabel(int label, NodeList& s1, NodeList& s2) { int tiedToAncestor = comparer.TiedToAncestor(label); ComputeMatchForLabel(s1, s2, tiedToAncestor, EpsilonDistance); // almost exact match ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance1); // ok match ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance2); // ok match ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance3); // ok match ComputeMatchForLabel(s1, s2, tiedToAncestor, MaxDistance); // any match } void ComputeMatchForLabel(NodeList& s1, NodeList& s2, int tiedToAncestor, double maxAcceptableDistance) { // Obviously, the algorithm below is O(n^2). However, in the common case, the 2 lists will // be sequences that exactly match. The purpose of "firstNonMatch2" is to reduce the complexity // to O(n) in this case. Basically, the pointer is the 1st non-matched node in the list of nodes of tree2 // with the given label. // Whenever we match to firstNonMatch2 we set firstNonMatch2 to the subsequent node. // So in the case of totally matching sequences, we process them in O(n) - // both node1 and firstNonMatch2 will be advanced simultaneously. UnmatchedIterator i1(s1); for (;;) { PNode node1 = i1.GetNextUnmatched(); if (!node1) break; Assert(!HasPartnerInTree2(node1)); // Find node2 that matches node1 the best, i.e. has minimal distance. double bestDistance = MaxDistance; PNode bestMatch = nullptr; int bestMatchIndex = -1; // node1's best match index in list2 bool matched = false; UnmatchedIterator i2(s2); for (;;) { PNode node2 = i2.GetNextUnmatched(); if (!node2) break; Assert(!HasPartnerInTree1(node2)); // this requires parents to be processed before their children: if (tiedToAncestor > 0) { // TODO: For nodes tied to their parents, // consider avoiding matching them to all other nodes of the same label. // Rather we should only match them with their siblings that share the same parent. PNode ancestor1 = comparer.GetAncestor(node1, tiedToAncestor); PNode ancestor2 = comparer.GetAncestor(node2, tiedToAncestor); Assert(comparer.GetLabel(ancestor1) < comparer.GetLabel(node1)); if (!Contains(ancestor1, ancestor2)) { continue; } } // We know that // 1. (node1, node2) not in M // 2. Both of their parents are matched to the same parent (or are not matched) // // Now, we have no other choice than comparing the node "values" // and looking for the one with the smaller distance. // double distance = comparer.GetDistance(node1, node2); if (distance < bestDistance) { matched = true; bestMatch = node2; bestMatchIndex = i2.CurIndex(); bestDistance = distance; // We only stop if we've got an exact match. This is to resolve the problem // of entities with identical names(name is often used as the "value" of a // node) but with different "sub-values" (e.g. two locals may have the same name // but different types. Since the type is not part of the value, we don't want // to stop looking for the best match if we don't have an exact match). if (distance == ExactMatchDistance) { break; } } } if (matched && bestDistance <= maxAcceptableDistance) { Add(node1, bestMatch); i1.MarkCurrentMatched(); // i1's match is current node1 i2.MarkMatched(bestMatchIndex); // i2's match is one of the nodes examined in the above for(;;) pass } } } void Add(PNode node1, PNode node2) { Assert(comparer.TreesEqual(node1, root1)); Assert(comparer.TreesEqual(node2, root2)); oneToTwo->Add(node1, node2); twoToOne->Add(node2, node1); } // The customized Match algorithm iterates over the 2 node lists, compares every unmatched node pair to match nodes. // To find the next unmatched node, original algorithm iterates over every node in each list, use a dictionary lookup // to test if the node has been matched or not, until it sees next unmatched node. This could be very expensive if the // lists are huge. E.g., assume the only diff is inserting a new node at the beginning of list2. Then for each node in // list1, it checks every node starting from the beginning new node in list2 for next unmatched node. This results in // O(N^2) dictionary lookups. And we do 5 passes of these. // // To improve on this, we can try to record every match span and directly jump to next unmatched position. Note that // in both lists once a node is matched, the list entry is no longer used. We can reuse that space to record extra info. // * Original PNode pointer value must be at even address. The list item must have 0 at bit0 (lowest bit). // * Once a node is matched, mark 1 at bit0. With this we can get rid of dictionary lookup. // * Next, for each matched entry, use the upper bits to record "next" unmatched index. Try to maintain match span, // so that from a matched node we can directly jump to next unmatched index. // // This class is for above purpose. Expected call pattern: // * GetNextUnmatched, [MarkCurrentMatched], GetNextUnmatched, [MarkCurrentMatched], ... // -- (A) With first MarkCurrentMatched we know the start of a match span. // -- (B) Subsequent MarkCurrentMatched indicates continuous match span. // -- (C) When MarkCurrentMatched is not called for an entry, we know the end of a match span. Record the whole // span (A)->(C). If walked again we would directly jump from (A) to (C). // * Random MarkMatched(i) // -- We don't know the exact match span. Just mark this entry "i" as matched, but set its "next" (upper bits) to 0. // -- During next pass, we can merge all adjacent match spans and individual matched entries to bigger match spans. // This would help next pass (we have 5). // class UnmatchedIterator { private: NodeList& list; int lastMatched; // last matched node index. -1 means no known last matched index. int index; // current examining index. Only moved by GetNextUnmatched(). public: UnmatchedIterator(NodeList& list) : list(list), lastMatched(-1), index(-1) { VerifySize(list); } ~UnmatchedIterator() { // If we have lastMatched, we could have one of following: // * index is matched by MarkCurrentMatched(). Link lastMatched -> index (== lastMatched). GetNextUnmatched() can handle it. // * index remains unmatched (ends a matched sequence). Link lastMatched -> index. // * index is out of range. That means [lastMatched, ...end) are all matched. Link lastMatched -> index (out of range). // if (lastMatched >= 0) { SetNext(lastMatched, index); } } PNode GetNextUnmatched() { // If current ends a matched sequence, make a link [lastMatched -> current). if (lastMatched >= 0 && !IsMatched(index)) { SetNext(lastMatched, index); lastMatched = -1; } ++index; if (index < list.Count()) { if (IsMatched(index)) { if (lastMatched < 0) // Check if current starts a matched sequence { lastMatched = index; } // Jumps all matched span, until sees an unmatched entry or the end. int next; while (index < list.Count() && IsNext(list.Item(index), &next)) { index = max(next, index + 1); // Ensure moves forward (next could be 0, from individual MarkMatched() call). } } if (index < list.Count()) { return list.Item(index); } } return nullptr; } int CurIndex() const { return index; } void MarkMatched(int i) { if (i == index) { MarkCurrentMatched(); } else { SetMatched(i); } } void MarkCurrentMatched() { Assert(!IsMatched(index)); SetMatched(index); if (lastMatched < 0) // If current starts a matched sequence { lastMatched = index; } } private: static void VerifySize(const NodeList& list) { if (list.Count() > INT_MAX / 2) // Limit max size as we used bit0 { Math::DefaultOverflowPolicy(); } } template static void SetMatched(P& node) { SetNext(node, 0); } static bool IsMatched(PNode node) { return !!(reinterpret_cast(node) & 1); } template static void SetNext(P& node, int next) { UINT_PTR value = (static_cast(next) << 1) | 1; node = reinterpret_cast(value); } static bool IsNext(PNode node, _Out_ int* next) { UINT_PTR value = reinterpret_cast(node); if (value & 1) { *next = static_cast(value >> 1); return true; } return false; } void SetMatched(int i) { SetMatched(list.Item(i)); } bool IsMatched(int i) const { return IsMatched(list.Item(i)); } void SetNext(int i, int next) { SetNext(list.Item(i), next); } bool IsNext(int i, _Out_ int* next) const { return IsNext(list.Item(i), next); } }; }; template const double TreeMatch::ExactMatchDistance = TreeComparer::ExactMatchDistance; template const double TreeMatch::EpsilonDistance = TreeComparer::EpsilonDistance; template const double TreeMatch::MatchingDistance1 = 0.5; template const double TreeMatch::MatchingDistance2 = 1.0; template const double TreeMatch::MatchingDistance3 = 1.5; template const double TreeMatch::MaxDistance = 2.0; //----------------------------------------------------------------------------- // Represents an edit operation on a tree or a sequence of nodes. //----------------------------------------------------------------------------- template class Edit { private: EditKind kind; PNode node1; PNode node2; public: Edit() {} // // Insert nullptr NewNode // Delete OldNode nullptr // Move/Update OldNode NewNode // Edit(EditKind kind, PNode node1, PNode node2) : kind(kind), node1(node1), node2(node2) { Assert((node1 == nullptr) == (kind == EditKind::Insert)); Assert((node2 == nullptr) == (kind == EditKind::Delete)); } EditKind Kind() const { return kind; } PNode OldNode() const { return node1; } PNode NewNode() const { return node2; } }; //----------------------------------------------------------------------------- // Represents a sequence of tree edits. //----------------------------------------------------------------------------- template class EditScript { public: typedef TreeMatch TreeMatch; typedef typename TreeMatch::PNode PNode; typedef typename TreeMatch::NodeList NodeList; typedef typename TreeMatch::NodeMap NodeMap; typedef JsUtil::List, Allocator, /*isLeaf*/true> EditList; private: const TreeMatch& match; TreeComparer comparer; EditList edits; public: EditScript(Allocator* alloc, const TreeMatch& match) : match(match), comparer(match.Comparer()), edits(alloc) { AddUpdatesInsertsMoves(); AddDeletes(); } const EditList& Edits() const { return edits; } private: PNode Root1() const { return match.OldRoot(); } PNode Root2() const { return match.NewRoot(); } void AddUpdatesInsertsMoves() { // Breadth-first traversal. ProcessNode(Root2()); JsUtil::Queue queue(edits.GetAllocator()); queue.Enqueue(Root2()); while (!queue.Empty()) { PNode head = queue.Dequeue(); comparer.MapChildren(head, [&](PNode child) { ProcessNode(child); queue.Enqueue(child); }); } } void ProcessNode(PNode x) { Assert(comparer.TreesEqual(x, Root2())); // NOTE: // Our implementation differs from the algorithm described in the paper in following: // - We don't update M' and T1 since we don't need the final matching and the transformed tree. // - Insert and Move edits don't need to store the offset of the nodes relative to their parents, // so we don't calculate those. Thus we don't need to implement FindPos. // - We don't mark nodes "in order" since the marks are only needed by FindPos. // a) // Let x be the current node in the breadth-first search of T2. // Let y = parent(x). // Let z be the partner of parent(x) in M'. (note: we don't need z for insert) // // NOTE: // If we needed z then we would need to be updating M' as we encounter insertions. PNode w; bool hasPartner = match.TryGetPartnerInTree1(x, &w); PNode y; bool hasParent = comparer.TryGetParent(x, &y); if (!hasPartner) { // b) If x has no partner in M'. // i. k := FindPos(x) // ii. Append INS((w, a, value(x)), z, k) to E for a new identifier w. // iii. Add (w, x) to M' and apply INS((w, a, value(x)), z, k) to T1. edits.Add(Edit(EditKind::Insert, /*node1*/nullptr, /*node2*/x)); // NOTE: // We don't update M' here. } else if (hasParent) { // c) else if x is not a root // i. Let w be the partner of x in M', and let v = parent(w) in T1. PNode v = comparer.GetParent(w); // ii. if value(w) != value(x) // A. Append UPD(w, value(x)) to E // B. Apply UPD(w, value(x) to T1 // Let the Comparer decide whether an update should be added to the edit list. // The Comparer defines what changes in node values it cares about. if (!comparer.ValuesEqual(w, x)) { edits.Add(Edit(EditKind::Update, /*node1*/w, /*node2*/x)); } // If parents of w and x don't match, it's a move. // iii. if not (v, y) in M' // NOTE: The paper says (y, v) but that seems wrong since M': T1 -> T2 and w,v in T1 and x,y in T2. if (!match.Contains(v, y)) { // A. Let z be the partner of y in M'. (NOTE: z not needed) // B. k := FindPos(x) // C. Append MOV(w, z, k) // D. Apply MOV(w, z, k) to T1 edits.Add(Edit(EditKind::Move, /*node1*/w, /*node2*/x)); } } // d) AlignChildren(w, x) // NOTE: If we just applied an INS((w, a, value(x)), z, k) operation on tree T1 // the newly created node w would have no children. So there is nothing to align. if (hasPartner) { AlignChildren(w, x); } } void AddDeletes() { // 3. Do a post-order traversal of T1. // a) Let w be the current node in the post-order traversal of T1. // b) If w has no partner in M' then append DEL(w) to E and apply DEL(w) to T1. // // NOTE: The fact that we haven't updated M' during the Insert phase // doesn't affect Delete phase. The original algorithm inserted new node n1 into T1 // when an insertion INS(n1, n2) was detected. It also added (n1, n2) to M'. // Then in Delete phase n1 is visited but nothing is done since it has a partner n2 in M'. // Since we don't add n1 into T1, not adding (n1, n2) to M' doesn't affect the Delete phase. comparer.MapDescendants(Root1(), [&](PNode w) { if (!match.HasPartnerInTree2(w)) { edits.Add(Edit(EditKind::Delete, /*node1*/w, /*node2*/nullptr)); } }); } void AlignChildren(PNode w, PNode x) { Assert(comparer.TreesEqual(w, Root1())); Assert(comparer.TreesEqual(x, Root2())); Allocator* alloc = edits.GetAllocator(); // Step 1 // Make all children of w and and all children x "out of order" // NOTE: We don't need to mark nodes "in order". // Step 2 // Let S1 be the sequence of children of w whose partner are children // of x and let S2 be the sequence of children of x whose partner are // children of w. NodeList s1(alloc), s2(alloc); if (!TryGetMatchedChildren(s1, w, x, [&](PNode e, PNode* partner) { return match.TryGetPartnerInTree2(e, partner); }) || !TryGetMatchedChildren(s2, x, w, [&](PNode e, PNode* partner) { return match.TryGetPartnerInTree1(e, partner); })) { return; } // Step 3, 4 // Define the function Equal(a,b) to be true if and only if (a,b) in M' // Let S <- LCS(S1, S2, Equal) NodeMap s(alloc); { LongestCommonSubsequence lcs(alloc, s1.Count(), s2.Count(), [&](int indexA, int indexB) { return match.Contains(s1.Item(indexA), s2.Item(indexB)); }); lcs.MapMatches([&](int indexA, int indexB) { s.AddNew(s1.Item(indexA), s2.Item(indexB)); }); } // Step 5 // For each (a,b) in S, mark nodes a and b "in order" // NOTE: We don't need to mark nodes "in order". // Step 6 // For each a in S1, b in S2 such that (a,b) in M but (a,b) not in S // (a) k <- FindPos(b) // (b) Append MOV(a,w,k) to E and apply MOV(a,w,k) to T1 // (c) Mark a and b "in order" // NOTE: We don't mark nodes "in order". s1.Map([&](int index, PNode a) { PNode b; if (match.TryGetPartnerInTree2(a, &b) // (a,b) in M && comparer.GetParent(b) == x // => b in S2 since S2 == { b | parent(b) == x && parent(partner(b)) == w } && !ContainsPair(s, a, b)) // (a,b) not in S { Assert(comparer.TreesEqual(a, Root1())); Assert(comparer.TreesEqual(b, Root2())); edits.Add(Edit(EditKind::Reorder, /*node1*/a, /*node2*/b)); } }); } // Helper: Get the sequence of children of x whose partner are children of y. template bool TryGetMatchedChildren(NodeList& nodes, PNode x, PNode y, const TryGetPartnerFunc& tryGetPartner) { Assert(nodes.Empty()); comparer.MapChildren(x, [&](PNode e) { PNode partner; if (tryGetPartner(e, &partner) && comparer.GetParent(partner) == y) { nodes.Add(e); } }); return !nodes.Empty(); } static bool ContainsPair(const NodeMap& dict, PNode a, PNode b) { PNode value; return dict.TryGetValue(a, &value) && value == b; } };