From 2a1e105f7a22a2c3258af40698e2be3e34245abd Mon Sep 17 00:00:00 2001 From: Daniela Brozzoni Date: Wed, 8 Jan 2025 18:15:22 +0100 Subject: [PATCH] Add GETADDR scraping script --- getaddr-script/.gitignore | 3 + getaddr-script/README.md | 58 +++++++++++++++ getaddr-script/compare.py | 122 ++++++++++++++++++++++++++++++++ getaddr-script/getaddr.py | 104 +++++++++++++++++++++++++++ getaddr-script/requirements.txt | 2 + src/net_processing.cpp | 4 ++ 6 files changed, 293 insertions(+) create mode 100644 getaddr-script/.gitignore create mode 100644 getaddr-script/README.md create mode 100644 getaddr-script/compare.py create mode 100644 getaddr-script/getaddr.py create mode 100644 getaddr-script/requirements.txt diff --git a/getaddr-script/.gitignore b/getaddr-script/.gitignore new file mode 100644 index 0000000000000..2884f80bb4743 --- /dev/null +++ b/getaddr-script/.gitignore @@ -0,0 +1,3 @@ +data/ +plot.png +.venv diff --git a/getaddr-script/README.md b/getaddr-script/README.md new file mode 100644 index 0000000000000..e534ea605b199 --- /dev/null +++ b/getaddr-script/README.md @@ -0,0 +1,58 @@ +# Bitcoin Peer Address Analysis Scripts +## Overview + +This directory contains two Python scripts designed to interact with Bitcoin Core: + +- getaddr.py: Sends a GETADDR message to Bitcoin peers specified in your Bitcoin Core configuration and extracts the received ADDR messages from the debug logs. +- compare.py: Compares the data retrieved by getaddr.py and visualizes overlaps between peer address buckets. + +## Prerequisites + +- Bitcoin Core: You must compile Bitcoin Core from the current branch. This branch includes modifications to Bitcoin Core's source code to log `GETADDR` responses. +- Python 3.x: Ensure Python 3.x is installed and install the packages listed in requirements.txt + +## First-Time Configuration +### 1. Compile Bitcoin Core + +To retrieve GETADDR responses, you must compile Bitcoin Core from this branch. + +### 2. 
Configure bitcoin.conf + +In your bitcoin.conf file, use the connect= directive to specify the nodes you want to connect to. These nodes will be the only ones the script interacts with. For example: + +``` +connect=node1.example.com +connect=123.45.67.89 +``` + +This ensures the script connects only to the specified peers and retrieves data exclusively from them. + +### 3. Update Script Variables + +Update the following variables in the scripts as needed: +In `getaddr.py`: +- `BITCOIN_CLI`: Path to the bitcoin-cli executable. +- `PARENT_DIR`: Directory to store output data +- `LOG_PATH`: Path of the Bitcoin Core log file + +In `compare.py`: +- `BASE_DIRECTORY`: Path to the base folder containing the timestamped data folders (with the CSV files) written by getaddr.py + +## Workflow +### Before Running the Scripts + +Prepare Your Environment: +- Compile Bitcoin Core +- Add peer nodes in bitcoin.conf using connect= +- Clean the debug.log file to avoid processing old data +- Start bitcoind and ensure peers specified in the configuration file are connected + +Run the Scripts: +- getaddr.py: + - Sends GETADDR messages to the connected peers + - Extracts the resulting ADDR messages from debug.log + - Outputs data into a timestamped folder in the specified directory +- compare.py: + - Processes the CSV files generated by getaddr.py + - Compares overlaps between peer address buckets + - Generates a bar chart (`node_address_matches.png`) diff --git a/getaddr-script/compare.py b/getaddr-script/compare.py new file mode 100644 index 0000000000000..372cf37f885de --- /dev/null +++ b/getaddr-script/compare.py @@ -0,0 +1,122 @@ +import os +from itertools import combinations + +# This script compares data from CSV files generated by getaddr.py, calculates overlaps +# between peer address buckets, and visualizes the results in a bar chart saved in node_address_matches.png. 
#
# Prerequisites:
# - Matplotlib and NumPy installed
# - CSV files generated by getaddr.py located in the specified `directory`
#
# Before running:
# Change `BASE_DIRECTORY` to match your data directory (see the TODO below)


# TODO: change me if necessary
BASE_DIRECTORY = "/home/daniela/Developer/bitcoin/getaddr-script/data"


def retrieve_and_clean_data(directory):
    """
    Read every CSV file in `directory` and collect the peer address data.

    Each line is expected to hold comma-separated fields in the order
    `<log prefix and peer>, <address>, <timestamp>, <services>`; the first
    field is discarded. Lines with fewer than three remaining fields
    (e.g. blank lines) are skipped instead of aborting the whole run.

    Args:
        directory: Path of the folder containing the `getaddr-<peer>.csv` files.

    Returns:
        dict: A dictionary where keys are cleaned filenames (the part after
            the first "-" with the ".csv" suffix removed, or the whole stem
            if the filename contains no "-") and values are dictionaries
            mapping addresses to their respective timestamps and services.
            Example:
            {
                'filename1': {
                    'addr1': ('timestamp1', 'services1'),
                    'addr2': ('timestamp2', 'services2'),
                },
                'filename2': { ... }
            }
    """
    buckets = {}
    # os.listdir() returns entries in arbitrary order; sort them so the
    # resulting buckets (and anything derived from them) are reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue
        file_path = os.path.join(directory, filename)
        print(file_path)
        bucket = {}
        with open(file_path, "r") as file:
            for line in file:
                # strip() instead of slicing off the last character: the
                # final line of a file may not end with a newline.
                fields = line.strip().split(", ")[1:]
                if len(fields) < 3:
                    continue
                address, timestamp, services = fields[:3]
                bucket[address] = (timestamp, services)
        # Split on the FIRST "-" only, so peer names that themselves contain
        # dashes (e.g. host names) are kept intact.
        stem = filename[:-len(".csv")]
        _, separator, peer_name = stem.partition("-")
        buckets[peer_name if separator else stem] = bucket
    return buckets


# Compares two address buckets and calculates the number of perfect and partial matches.
+def check_overlap(bucket_a, bucket_b): + print(f"Bucket A contains {len(bucket_a)} values") + print(f"Bucket B contains {len(bucket_b)} values") + perfect_matches = 0 + partial_matches = 0 + for address, (timestamp, services) in bucket_a.items(): + if address in bucket_b: + if bucket_b[address] == (timestamp, services): + perfect_matches += 1 + else: + partial_matches += 1 + if len(bucket_a) != 0 and len(bucket_b) != 0: + print(f"Perfect matches: {perfect_matches} ({round(perfect_matches * 100 / len(bucket_a), 2)}% A, {round(perfect_matches * 100 / len(bucket_b), 2)}% B)") + print(f"Partial matches: {partial_matches} ({round(partial_matches * 100 / len(bucket_a), 2)}% A, {round(partial_matches * 100 / len(bucket_b), 2)}% B)") + else: + print("Empty bucket") + return (perfect_matches, partial_matches) + + +def plot(nodes, matches, directory): + import matplotlib.pyplot as plt + import numpy as np + x = np.arange(len(nodes)) # the label locations + width = 0.25 # the width of the bars + multiplier = 0 + + fig, ax = plt.subplots(layout='constrained') + + for attribute, measurement in matches.items(): + offset = width * multiplier + rects = ax.bar(x + offset, measurement, width, label=attribute) + ax.bar_label(rects, padding=3) + multiplier += 1 + + # Add some text for labels, title and custom x-axis tick labels, etc. 
+ ax.set_title("Comparison of Address Matches Between Nodes") + ax.set_xlabel('Node Pairs') + ax.set_ylabel('Number of Address Matches') + ax.set_xticks(x + width, nodes) + ax.set_xticklabels(nodes, rotation=45, ha='right') + ax.legend(loc='upper left', ncols=3) + + plt.savefig(os.path.join(directory, "node_address_matches.png")) + + +if __name__ == "__main__": + dir = input('Enter the subdirectory name (within BASE_DIRECTORY) where the data is stored: ') + directory = os.path.join(BASE_DIRECTORY, dir) + print(f'Looking for data in {directory}...') + buckets = retrieve_and_clean_data(directory) + nodes = [] + matches = { + 'perfect_match': [], + 'partial_match': [], + } + for pair in combinations(buckets, 2): + print(f"Comparing {pair[0]} and {pair[1]}") + + # Shortens a node name for display in plots. + def node_label(name): + if len(name) < 15: + return name + return f"{name[:5]}..{name[-5:]}" + + nodes.append(f"{node_label(pair[0])} - {node_label(pair[1])}") + result = check_overlap(buckets[pair[0]], buckets[pair[1]]) + matches['perfect_match'].append(result[0]) + matches['partial_match'].append(result[1]) + plot(nodes, matches, directory) diff --git a/getaddr-script/getaddr.py b/getaddr-script/getaddr.py new file mode 100644 index 0000000000000..1cc46b03dd49f --- /dev/null +++ b/getaddr-script/getaddr.py @@ -0,0 +1,104 @@ +import datetime +import os +import subprocess +import time +import json + +# This script sends a GETADDR message to every connected peer and extracts +# received ADDR messages from the debug.log. It outputs the results into CSV files +# +# Requirements: +# - Ensure `bitcoin-cli` is compiled and available at the specified path. +# - Debugging enabled in `bitcoind` configuration. +# - Python 3.x with `subprocess` and `json` modules. 
+ +# First time configuration: +# Change parent_dir to match your bitcoin directory (see #TODO below) +# Add to the bitcoin core configuration the node addresses, using connect= + +# Before running script: +# Clean debug.log file +# Recompile bitcoind (if there's any changes) +# Start bitcoind and i2pd, bitcoind configuration contains addresses with connect= +# Wait for peers to connect + +# TODO: these two variables are hardcoded, change if necessary +BITCOIN_CLI = "../build/src/bitcoin-cli" +LOG_PATH = "/home/daniela/.bitcoin/debug.log" +PARENT_DIR = "/home/daniela/Developer/bitcoin/getaddr-script/data" + +peers = {} +now = datetime.datetime.now() +dir = now.strftime("%Y-%m-%d_%H-%M-%S") + +path = os.path.join(PARENT_DIR, dir) +os.makedirs(path, exist_ok=True) +print(f'Created data directory {path}') + +# Check if the Bitcoin node is running before starting operations. +retries = 10 +while retries > 0: + try: + result = subprocess.run([BITCOIN_CLI, "getblockchaininfo"], capture_output=True, check=True) + break + except subprocess.CalledProcessError as e: + print(e.stderr) + time.sleep(1) + retries -= 1 +if retries == 0: + raise RuntimeError("Bitcoin Core is not running. Exiting...") + +print("Bitcoin Core is ok, ready to start") + +getpeerinfo_result = subprocess.run([BITCOIN_CLI, "getpeerinfo"], capture_output=True, check=True) +getpeerinfo = json.loads(getpeerinfo_result.stdout) + +# Send GETADDR messages to each peer. Retry up to 3 times if sending fails. +for peer in getpeerinfo: + peer_id = peer['id'] + peer_addr = peer['addr'] + print(f"Sending GETADDR to peer {peer_addr}, id {peer_id}...") + tries = 0 + while True: + try: + tries += 1 + if tries == 3: + print(f"Error: Failed to send GETADDR to peer {peer_addr} after 3 attempts. Skipping...") + break + subprocess.run([BITCOIN_CLI, "sendmsgtopeer", f"{peer_id}", "getaddr", ""], capture_output=True, check=True) + print("Sent!") + break + except: + print(f"Error: Couldn't send GETADDR to peer {peer_addr}. 
Retrying...") + time.sleep(10) + peers[peer_id] = peer_addr + +print("Gathering responses...") +time.sleep(60) + +# Extract relevant log data for each peer and save it in a CSV file. +try: + filtered_lines = [] + + # Step 1: Filter the log file for lines containing "GETADDR SCRIPT" + with open(LOG_PATH, "r") as log_file: + for line in log_file: + if "GETADDR SCRIPT" in line: + filtered_lines.append(line) + + # Step 2: Separate filtered lines by peer addresses + peer_matches = {peer_id: [] for peer_id in peers} + for line in filtered_lines: + for peer_id, peer_addr in peers.items(): + if f"GETADDR SCRIPT{peer_addr}" in line: + peer_matches[peer_id].append(line) + + # Step 3: Save matching lines for each peer to individual CSV files + for peer_id, peer_addr in peers.items(): + file_name = f"getaddr-{peer_addr}.csv" + print(f"Saving matches for peer {peer_addr} to file: {file_name}") + with open(os.path.join(path, file_name), "w") as output_file: + output_file.writelines(peer_matches[peer_id]) + +except Exception as e: + print(f"Error processing log file: {e}") diff --git a/getaddr-script/requirements.txt b/getaddr-script/requirements.txt new file mode 100644 index 0000000000000..db5d81e01ea66 --- /dev/null +++ b/getaddr-script/requirements.txt @@ -0,0 +1,2 @@ +matplotlib +numpy diff --git a/src/net_processing.cpp b/src/net_processing.cpp index a19443c0f5615..b431f389b9023 100644 --- a/src/net_processing.cpp +++ b/src/net_processing.cpp @@ -3798,6 +3798,10 @@ void PeerManagerImpl::ProcessMessage(CNode& pfrom, const std::string& msg_type, vRecv >> ser_params(vAddr); + for (CAddress& addr: vAddr) { + LogDebug(BCLog::NET, "GETADDR SCRIPT%s, %s, %d, %d", pfrom.addr.ToStringAddr(), addr.ToStringAddrPort(), addr.nTime.time_since_epoch(), addr.nServices); + } + if (!SetupAddressRelay(pfrom, *peer)) { LogDebug(BCLog::NET, "ignoring %s message from %s peer=%d\n", msg_type, pfrom.ConnectionTypeAsString(), pfrom.GetId()); return;