Skip to content

Commit

Permalink
Parse MediaWiki dump XML and store page names in the index
Browse files Browse the repository at this point in the history
This task is mostly bound by the network speed, so the fact that quick-xml
might be slower or faster than awk is irrelevant. Further benchmarks might
be needed later on if the mediawiki dumps are loaded locally, such as on
Toolforge.

Note: page names alone for frwiki account for 326 MB uncompressed, or 102 MB
compressed with gzip --best. It's probably a good idea to compress using zstd
at build time and to decompress in RAM at query time (< 2 s with gzip).
  • Loading branch information
Arkanosis committed May 4, 2020
1 parent 5bbafae commit 60757c8
Show file tree
Hide file tree
Showing 6 changed files with 114 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
target
*.bak
*.idx
17 changes: 17 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ categories = ["command-line-utilities"]
edition = "2018"

[dependencies]
byteorder = "1.3"
docopt = "1.1"
quick-xml = "0.18"
serde = "1.0"
serde_derive = "1.0"
81 changes: 77 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,91 @@
use std::io::Read;
use byteorder::WriteBytesExt;

use quick_xml::{
Reader,
events::Event,
};

use std::io::{
BufRead,
Write,
};

/// Kind of XML element whose text content is currently being read.
///
/// `build` updates this on every start/end event so that it knows how to
/// interpret the next text event.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Tag {
    /// Inside a `<title>` element.
    Title,
    /// Inside a `<username>` element.
    UserName,
    /// Inside any other element, or right after an end tag.
    Other,
}

/// Returns the crate version recorded at compile time.
///
/// Falls back to `"unknown"` when `CARGO_PKG_VERSION` was not set in the
/// build environment (for instance when compiling without Cargo).
pub fn version() -> &'static str {
    match option_env!("CARGO_PKG_VERSION") {
        Some(number) => number,
        None => "unknown",
    }
}

pub fn build(reader: &mut dyn Read, index: String) -> Result<(), ()> {
println!("building index '{}'", index);
unimplemented!();
/// Reports a failed index write on stderr.
///
/// Meant to be passed to `map_err` so that `build` can bail out with its unit
/// error through `?` without losing the underlying cause.
fn report_write_error(error: std::io::Error) {
    eprintln!("socksfinder: can't write index: {}", error);
}

/// Reads XML events from `reader` and writes every page title to `writer`.
///
/// The text of each `<title>` element is unescaped and written as raw bytes
/// followed by a single NUL byte. The text of `<username>` elements is only
/// printed for now. Titles and user names are echoed on stdout as a trace.
///
/// Texts that cannot be unescaped are skipped. Invalid UTF-8 is replaced in
/// the trace only; the bytes written to `writer` are left untouched.
///
/// # Errors
///
/// Returns `Err(())`, after printing the cause on stderr, when the XML cannot
/// be parsed or when writing to or flushing `writer` fails.
pub fn build(reader: &mut dyn BufRead, writer: &mut dyn Write) -> Result<(), ()> {
    let mut xml_reader = Reader::from_reader(reader);
    let mut buffer = Vec::new();
    let mut current_tag = Tag::Other;
    loop {
        match xml_reader.read_event(&mut buffer) {
            Ok(Event::Start(ref event)) => {
                current_tag = match event.name() {
                    b"title" => Tag::Title,
                    b"username" => Tag::UserName,
                    _ => Tag::Other,
                };
            }
            Ok(Event::End(_)) => current_tag = Tag::Other,
            Ok(Event::Text(text)) => match current_tag {
                Tag::Title => {
                    // Ignore encoding errors in the dump: a title that cannot
                    // be unescaped is neither printed nor indexed.
                    if let Ok(title) = text.unescaped() {
                        // Lossy decoding keeps the trace from aborting the build.
                        println!("page: '{}'", String::from_utf8_lossy(&title));
                        // TODO keep previous offset for index
                        writer.write_all(&title).map_err(report_write_error)?;
                        writer.write_u8(0).map_err(report_write_error)?;
                    }
                }
                Tag::UserName => {
                    if let Ok(user) = text.unescaped() {
                        println!("\tuser: '{}'", String::from_utf8_lossy(&user));
                    }
                    // TODO add user to map if not present, with an empty u32 list
                    // TODO add previous page name offset to user list
                }
                Tag::Other => (),
            },
            Err(error) => {
                eprintln!(
                    "XML parsing error at position {}: {:?}",
                    xml_reader.buffer_position(),
                    error
                );
                return Err(());
            }
            Ok(Event::Eof) => break,
            _ => (),
        }
        // The event borrowed from the buffer is no longer needed: reuse the allocation.
        buffer.clear();
    }
    // TODO for each user (alphabetically)
    // TODO     keep previous offset for index
    // TODO     write page offsets list length
    // TODO     write page offsets list
    // TODO keep previous offset for header
    // TODO write username -> user offset mapping as FST
    // TODO write FST offset as u32

    // Flush explicitly: a buffered writer would otherwise swallow late write
    // errors when it is dropped.
    writer.flush().map_err(report_write_error)?;
    Ok(())
}

/// Queries the index identified by `index` for the given `users`.
///
/// Only the trace output is implemented so far: the index name and each
/// requested user name are printed on stdout, then the function panics.
///
/// # Panics
///
/// Always panics with `unimplemented!()` once the trace has been printed.
pub fn query(index: String, users: &[String]) -> Result<(), ()> {
    println!("querying users on index '{}':", index);
    // TODO read last u32 -> offset of the FST
    // TODO mmap FST
    for user in users {
        println!("\t{}", user);
        // TODO lookup user in FST -> page offsets list offset for that user
        // TODO add (username, offset) to list
    }
    // TODO heap merge of all lists
    // TODO while ! heap.is_empty()
    // TODO     while heap.peek() is same
    // TODO         keep users
    // TODO     update co-occurrence matrix
    // TODO     write CSV line with page name, number of users, user names
    // TODO write CSV co-occurrence matrix
    unimplemented!()
}
11 changes: 10 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
use std::io::BufWriter;

use serde_derive::Deserialize;

use std::fs::File;

const USAGE: &str = "
Usage: socksfinder build <index>
socksfinder query <index> <user>...
Expand Down Expand Up @@ -43,7 +47,12 @@ fn main() {
println!("socksfinder v{}", socksfinder::version());
} else {
let result = if args.cmd_build {
socksfinder::build(&mut std::io::stdin(), args.arg_index)
let output = File::create(&args.arg_index).unwrap_or_else(|cause| {
println!("socksfinder: can't open index: {}: {}", &args.arg_index, &cause);
std::process::exit(1);
});
let mut buffered_output = BufWriter::new(output);
socksfinder::build(&mut std::io::stdin().lock(), &mut buffered_output)
} else if args.cmd_query {
socksfinder::query(args.arg_index, &args.arg_user)
} else {
Expand Down
7 changes: 7 additions & 0 deletions tests/fast-build-from-latest-frwiki-dump-and-query.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#! /bin/sh

# Streams the latest frwiki stub-meta-history dump into `socksfinder build`.

# Repository root: the parent of the directory that contains this script.
# (Taking `dirname` of a `basename` always yields ".", which only worked when
# the script was started from the repository root.)
BASEDIR="$(dirname "$(dirname "$(readlink -f "$0")")")"

# Download and decompress on the fly; the trace printed on stdout by the build
# is discarded.
curl "https://dumps.wikimedia.org/frwiki/latest/frwiki-latest-stub-meta-history.xml.gz" |
gunzip |
"$BASEDIR/target/release/socksfinder" build "frwiki-latest.idx" > /dev/null

0 comments on commit 60757c8

Please sign in to comment.