-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Parse MediaWiki dump XML and store page names in the index
This task is mostly bound by the network speed, so the fact that quick-xml might be slower or faster than awk is irrelevant. Further benchmarks might be needed later on if the MediaWiki dumps are loaded locally, such as on Toolforge. Note: page names alone for frwiki account for 326 MB uncompressed, or 102 MB compressed with gzip --best. It's probably a good idea to compress using zstd at build time and to decompress in RAM at query time (< 2 s with gzip).
- Loading branch information
Showing
6 changed files
with
114 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
target | ||
*.bak | ||
*.idx |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,91 @@ | ||
use std::io::Read; | ||
use byteorder::WriteBytesExt; | ||
|
||
use quick_xml::{ | ||
Reader, | ||
events::Event, | ||
}; | ||
|
||
use std::io::{ | ||
BufRead, | ||
Write, | ||
}; | ||
|
||
/// Which XML element's character data the parser is currently inside.
enum Tag {
    /// Inside a `<title>` element: the text is a page name.
    Title,
    /// Inside a `<username>` element: the text is a revision contributor.
    UserName,
    /// Inside any other element; its text is ignored.
    Other,
}
|
||
/// Returns the crate version as recorded by Cargo at build time,
/// or `"unknown"` when built outside of Cargo (e.g. by bare rustc).
pub fn version() -> &'static str {
    // Tail expression instead of an explicit `return` (idiomatic Rust).
    option_env!("CARGO_PKG_VERSION").unwrap_or("unknown")
}
|
||
pub fn build(reader: &mut dyn Read, index: String) -> Result<(), ()> { | ||
println!("building index '{}'", index); | ||
unimplemented!(); | ||
pub fn build(reader: &mut dyn BufRead, writer: &mut dyn Write) -> Result<(), ()> { | ||
let mut xml_reader = Reader::from_reader(reader); | ||
let mut buffer = Vec::new(); | ||
let mut current_tag = Tag::Other; | ||
loop { | ||
match xml_reader.read_event(&mut buffer) { | ||
Ok(Event::Start(ref event)) => { | ||
match event.name() { | ||
b"title" => current_tag = Tag::Title, | ||
b"username" => current_tag = Tag::UserName, | ||
_ => current_tag = Tag::Other, | ||
} | ||
}, | ||
Ok(Event::End(_)) => current_tag = Tag::Other, | ||
Ok(Event::Text(event)) => { | ||
match current_tag { | ||
Tag::Title => { | ||
print!("page: '{}'\n", event.unescape_and_decode(&xml_reader).unwrap()); | ||
match event.unescaped() { | ||
Ok(ref buffer) => { | ||
// TODO keep previous offset for index | ||
writer.write_all(buffer).unwrap(); | ||
writer.write_u8(0).unwrap(); | ||
} | ||
Err(_) => (), // ignore encoding error in the dump | ||
} | ||
} | ||
Tag::UserName => { | ||
print!("\tuser: '{}'\n", event.unescape_and_decode(&xml_reader).unwrap()); | ||
// TODO add user to map if not present, with an empty u32 list | ||
// TODO add previous page name offset to user list | ||
}, | ||
Tag::Other => (), | ||
} | ||
}, | ||
Err(error) => panic!("XML parsing error at position {}: {:?}", xml_reader.buffer_position(), error), | ||
Ok(Event::Eof) => break, | ||
_ => (), | ||
} | ||
buffer.clear(); | ||
} | ||
// TODO for each user (alphabetically) | ||
// TODO keep previous offset for index | ||
// TODO write page offsets list length | ||
// TODO write page offsets list | ||
// TODO keep previous offset for header | ||
// TODO write username -> user offset mapping as FST | ||
// TODO write FST offset as u32 | ||
Ok(()) | ||
} | ||
|
||
/// Queries the index `index` for the pages edited by each of `users`.
///
/// Currently only prints the query plan and the user names; the actual index
/// lookup is not implemented yet (the function panics via `unimplemented!`).
///
/// Takes `&[String]` rather than `&Vec<String>` (idiomatic slice parameter);
/// existing `&Vec<String>` call sites keep working through deref coercion.
pub fn query(index: String, users: &[String]) -> Result<(), ()> {
    println!("querying users on index '{}':", index);
    // TODO read last u32 -> offset of the FST
    // TODO mmap FST
    for user in users {
        println!("\t{}", user);
        // TODO lookup user in FST -> page offsets list offset for that user
        // TODO add (username, offset) to list
    }
    unimplemented!();
    // TODO heap merge of all lists
    // TODO while ! heap.is_empty()
    // TODO while heap.peek() is same
    // TODO keep users
    // TODO update co-occurrence matrix
    // TODO write CSV line with page name, number of users, user names
    // TODO write CSV co-occurrence matrix
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#! /bin/sh
# Downloads the frwiki stub-meta-history dump and feeds it to the index
# builder. Output is discarded: this is currently a network-bound benchmark.

# Abort on any command failure instead of silently continuing.
set -e

# BUG FIX: the previous dirname "$(basename ...)" always evaluated to ".",
# so the binary path only worked when run from the repository root.
# dirname of the resolved script path gives the actual script directory.
BASEDIR="$(dirname "$(readlink -f "$0")")"

# --fail makes curl exit non-zero on HTTP errors (e.g. 404) instead of
# piping an error page into gunzip.
curl --fail "https://dumps.wikimedia.org/frwiki/latest/frwiki-latest-stub-meta-history.xml.gz" |
gunzip |
"$BASEDIR/target/release/socksfinder" build "frwiki-latest.idx" > /dev/null