use a strings.Replacer to reverse names in internal/abi

This way, rather than using a double loop quadratic algorithm to search for each name to replace in a string, we can make use of the reasonably efficient generic replacer which makes use of tries. I haven't measured whether this is noticeably better or worse than the previous implementation, and we don't have any benchmarks for it, but it should be reasonable to assume that a very basic double loop is going to be slower for the larger builds with thousands of names. Copying some code from the strings package is not ideal, but it beats having to re-implement such an algorithm ourselves.
burrowers · Nov 27, 2024 · bbbd316 · bbbd316
1 parent 926f3de
commit bbbd316
Showing 1 changed file with 266 additions and 44 deletions.
diff --git a/reflect_abi_patch.go b/reflect_abi_patch.go
@@ -16,21 +16,25 @@ func abiNamePatch(path string) (string, error) {
 	}
 
 	find := `return unsafe.String(n.DataChecked(1+i, "non-empty string"), l)`
-	replace := `return _realName(unsafe.String(n.DataChecked(1+i, "non-empty string"), l))`
+	replace := `return _originalNames(unsafe.String(n.DataChecked(1+i, "non-empty string"), l))`
 
 	str := strings.Replace(string(data), find, replace, 1)
 
-	realname := `
-//go:linkname _realName
-func _realName(name string) string
-`
+	originalNames := `
+//go:linkname _originalNames
+func _originalNames(name string) string
+
+//go:linkname _originalNamesInit
+func _originalNamesInit()
 
-	return str + realname, nil
+func init() { _originalNamesInit() }
+`
+	return str + originalNames, nil
 }
 
 var reflectPatchFile = ""
 
-// reflectMainPrePatch adds the initial empty name mapping and _realName implementation
+// reflectMainPrePatch adds the initial empty name mapping and _originalNames implementation
 // to a file in the main package. The name mapping will be populated later after
 // analyzing the main package, since we need to know all obfuscated names that need mapping.
 // We split this into pre/post steps so that all variable names in the generated code
@@ -47,64 +51,282 @@ func reflectMainPrePatch(path string) ([]byte, error) {
 		return nil, err
 	}
 
-	nameMap := "\nvar _nameMap = map[string]string{}"
+	namePairs := "\nvar _namePairs = []string{}"
 
-	return append(content, []byte(realNameCode+nameMap)...), nil
+	return append(content, []byte(originalNamesCode+namePairs)...), nil
 }
 
 // reflectMainPostPatch populates the name mapping with the final obfuscated->real name
 // mappings after all packages have been analyzed.
 func reflectMainPostPatch(file []byte, lpkg *listedPackage, pkg pkgCache) []byte {
-	obfMapName := hashWithPackage(lpkg, "_nameMap")
-	nameMap := fmt.Sprintf("%s = map[string]string{", obfMapName)
+	obfMapName := hashWithPackage(lpkg, "_namePairs")
+	namePairs := fmt.Appendf(nil, "%s = []string{", obfMapName)
 
-	var b strings.Builder
+	namePairsFilled := bytes.Clone(namePairs)
 	keys := slices.Sorted(maps.Keys(pkg.ReflectObjectNames))
 	for _, obf := range keys {
-		b.WriteString(fmt.Sprintf(`"%s": "%s",`, obf, pkg.ReflectObjectNames[obf]))
+		namePairsFilled = fmt.Appendf(namePairsFilled, `%q, %q,`, obf, pkg.ReflectObjectNames[obf])
 	}
 
-	return bytes.Replace(file, []byte(nameMap), []byte(nameMap+b.String()), 1)
+	return bytes.Replace(file, namePairs, namePairsFilled, 1)
 }
 
 // The "name" internal/abi passes to this function doesn't have to be a simple "someName"
-// it can also be for function names:
-// "*pkgName.FuncName" (obfuscated)
-// or for structs the entire struct definition:
-// "*struct { AQ45rr68K string; ipq5aQSIqN string; hNfiW5O5LVq struct { gPTbGR00hu string } }"
+// it can also be for function names like "*pkgName.FuncName" (obfuscated)
+// or for structs the entire struct definition, like
 //
-// Therefore all obfuscated names which occur within name need to be replaced with their "real" equivalents.
+//	*struct { AQ45rr68K string; ipq5aQSIqN string; hNfiW5O5LVq struct { gPTbGR00hu string } }
 //
-// The code below does a more efficient version of:
+// Therefore all obfuscated names which occur within name need to be replaced with their original equivalents.
+// We make use of a trimmed down strings.Replacer, whose reasonably efficient trie-based
+// lookup structure is built once, at init time, and then reused for every call.
+// Note that we must initialize the replacer when internal/abi is initialized, not when main is initialized,
+// hence a second linkname function to plumb that through.
+const originalNamesCode = `
+var _originalNamesReplacer *_genericReplacer
+
+//go:linkname _originalNamesInit internal/abi._originalNamesInit
+func _originalNamesInit() {
+	_originalNamesReplacer = _makeGenericReplacer(_namePairs)
+}
+
+//go:linkname _originalNames internal/abi._originalNames
+func _originalNames(name string) string {
+	return _originalNamesReplacer.Replace(name)
+}
+
+// -- Lifted from internal/stringslite --
+
+func _hasPrefix(s, prefix string) bool {
+	return len(s) >= len(prefix) && s[0:len(prefix)] == prefix
+}
+
+// -- Lifted from strings as of Go 1.23 --
 //
-//	func _realName(name string) string {
-//			for obfName, real := range _nameMap {
-//				name = strings.ReplaceAll(name, obfName, real)
-//			}
+// With minor modifications to avoid type assertions,
+// as any reflection in internal/abi causes a recursive call to the runtime
+// which locks up the entire runtime. Moreover, we can't import strings.
 //
-//			return name
-//	}
-const realNameCode = `
-//go:linkname _realName internal/abi._realName
-func _realName(name string) string {
-	for i := 0; i < len(name); {
-		remLen := len(name[i:])
-		found := false
-		for obfName, real := range _nameMap {
-			keyLen := len(obfName)
-			if keyLen > remLen {
-				continue
+// Updating the code below should not be necessary in general,
+// unless upstream Go makes significant improvements to this replacer implementation.
+
+// _trieNode is a node in a lookup trie for prioritized key/value pairs. Keys
+// and values may be empty. For example, the trie containing keys "ax", "ay",
+// "bcbc", "x" and "xy" could have eight nodes:
+//
+//	n0  -
+//	n1  a-
+//	n2  .x+
+//	n3  .y+
+//	n4  b-
+//	n5  .cbc+
+//	n6  x+
+//	n7  .y+
+//
+// n0 is the root node, and its children are n1, n4 and n6; n1's children are
+// n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked
+// with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7
+// (marked with a trailing "+") are complete keys.
+type _trieNode struct {
+	// value is the value of the trie node's key/value pair. It is empty if
+	// this node is not a complete key.
+	value string
+	// priority is the priority (higher is more important) of the trie node's
+	// key/value pair; keys are not necessarily matched shortest- or longest-
+	// first. Priority is positive if this node is a complete key, and zero
+	// otherwise. In the example above, positive/zero priorities are marked
+	// with a trailing "+" or "-".
+	priority int
+
+	// A trie node may have zero, one or more child nodes:
+	//  * if the remaining fields are zero, there are no children.
+	//  * if prefix and next are non-zero, there is one child in next.
+	//  * if table is non-zero, it defines all the children.
+	//
+	// Prefixes are preferred over tables when there is one child, but the
+	// root node always uses a table for lookup efficiency.
+
+	// prefix is the difference in keys between this trie node and the next.
+	// In the example above, node n4 has prefix "cbc" and n4's next node is n5.
+	// Node n5 has no children and so has zero prefix, next and table fields.
+	prefix string
+	next   *_trieNode
+
+	// table is a lookup table indexed by the next byte in the key, after
+	// remapping that byte through _genericReplacer.mapping to create a dense
+	// index. In the example above, the keys only use 'a', 'b', 'c', 'x' and
+	// 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and
+	// _genericReplacer.tableSize will be 5. Node n0's table will be
+	// []*_trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped
+	// 'a', 'b' and 'x'.
+	table []*_trieNode
+}
+
+func (t *_trieNode) add(key, val string, priority int, r *_genericReplacer) {
+	if key == "" {
+		if t.priority == 0 {
+			t.value = val
+			t.priority = priority
+		}
+		return
+	}
+
+	if t.prefix != "" {
+		var n int // length of the longest common prefix
+		for ; n < len(t.prefix) && n < len(key); n++ {
+			if t.prefix[n] != key[n] {
+				break
+			}
+		}
+		if n == len(t.prefix) {
+			t.next.add(key[n:], val, priority, r)
+		} else if n == 0 {
+			var prefixNode *_trieNode
+			if len(t.prefix) == 1 {
+				prefixNode = t.next
+			} else {
+				prefixNode = &_trieNode{
+					prefix: t.prefix[1:],
+					next:   t.next,
+				}
 			}
-			if name[i:i+keyLen] == obfName {
-				name = name[:i] + real + name[i+keyLen:]
-				found = true
-				i += len(real)
+			keyNode := new(_trieNode)
+			t.table = make([]*_trieNode, r.tableSize)
+			t.table[r.mapping[t.prefix[0]]] = prefixNode
+			t.table[r.mapping[key[0]]] = keyNode
+			t.prefix = ""
+			t.next = nil
+			keyNode.add(key[1:], val, priority, r)
+		} else {
+			// Insert new node after the common section of the prefix.
+			next := &_trieNode{
+				prefix: t.prefix[n:],
+				next:   t.next,
+			}
+			t.prefix = t.prefix[:n]
+			t.next = next
+			next.add(key[n:], val, priority, r)
+		}
+	} else if t.table != nil {
+		// Insert into existing table.
+		m := r.mapping[key[0]]
+		if t.table[m] == nil {
+			t.table[m] = new(_trieNode)
+		}
+		t.table[m].add(key[1:], val, priority, r)
+	} else {
+		t.prefix = key
+		t.next = new(_trieNode)
+		t.next.add("", val, priority, r)
+	}
+}
+
+func (r *_genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
+	// Iterate down the trie to the end, and grab the value and keylen with
+	// the highest priority.
+	bestPriority := 0
+	node := &r.root
+	n := 0
+	for node != nil {
+		if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
+			bestPriority = node.priority
+			val = node.value
+			keylen = n
+			found = true
+		}
+
+		if s == "" {
+			break
+		}
+		if node.table != nil {
+			index := r.mapping[s[0]]
+			if int(index) == r.tableSize {
 				break
 			}
+			node = node.table[index]
+			s = s[1:]
+			n++
+		} else if node.prefix != "" && _hasPrefix(s, node.prefix) {
+			n += len(node.prefix)
+			s = s[len(node.prefix):]
+			node = node.next
+		} else {
+			break
 		}
-		if !found {
-			i++
+	}
+	return
+}
+
+type _genericReplacer struct {
+	root _trieNode
+	// tableSize is the size of a trie node's lookup table. It is the number
+	// of unique key bytes.
+	tableSize int
+	// mapping maps from key bytes to a dense index for _trieNode.table.
+	mapping [256]byte
+}
+
+func _makeGenericReplacer(oldnew []string) *_genericReplacer {
+	r := new(_genericReplacer)
+	// Find each byte used, then assign them each an index.
+	for i := 0; i < len(oldnew); i += 2 {
+		key := oldnew[i]
+		for j := 0; j < len(key); j++ {
+			r.mapping[key[j]] = 1
 		}
 	}
-	return name
-}`
+
+	for _, b := range r.mapping {
+		r.tableSize += int(b)
+	}
+
+	var index byte
+	for i, b := range r.mapping {
+		if b == 0 {
+			r.mapping[i] = byte(r.tableSize)
+		} else {
+			r.mapping[i] = index
+			index++
+		}
+	}
+	// Ensure root node uses a lookup table (for performance).
+	r.root.table = make([]*_trieNode, r.tableSize)
+
+	for i := 0; i < len(oldnew); i += 2 {
+		r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
+	}
+	return r
+}
+
+func (r *_genericReplacer) Replace(s string) string {
+	dst := make([]byte, 0, len(s))
+	var last int
+	var prevMatchEmpty bool
+	for i := 0; i <= len(s); {
+		// Fast path: s[i] is not a prefix of any pattern.
+		if i != len(s) && r.root.priority == 0 {
+			index := int(r.mapping[s[i]])
+			if index == r.tableSize || r.root.table[index] == nil {
+				i++
+				continue
+			}
+		}
+
+		// Ignore the empty match iff the previous loop found the empty match.
+		val, keylen, match := r.lookup(s[i:], prevMatchEmpty)
+		prevMatchEmpty = match && keylen == 0
+		if match {
+			dst = append(dst, s[last:i]...)
+			dst = append(dst, val...)
+			i += keylen
+			last = i
+			continue
+		}
+		i++
+	}
+	if last != len(s) {
+		dst = append(dst, s[last:]...)
+	}
+	return string(dst)
+}
+`