Skip to content

Commit

Permalink
fix for
Browse files: browse the repository at this point in the history
  • Loading branch information
spencermountain committed Dec 3, 2018
1 parent 161e355 commit bd010f5
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 4 deletions.
9 changes: 6 additions & 3 deletions scratch.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,14 @@ const drop = require('./src/lib/drop-db');
//144mb → 2.5 minutes = 57mb per worker per minute
// const path = '/home/spencer/mountain/dumpster-dive/tests/tinywiki-latest-pages-articles.xml';
// const path = '/media/spencer/07d11766-2ce6-4f8a-8ec0-a3d144a3d4cd/big_data/wikipedia/enwiki-latest-pages-articles.xml';
const path = '/Users/spencer/data/wikipedia/enwiki-latest-pages-articles.xml'
// const path = '/Users/spencer/data/wikipedia/enwiki-latest-pages-articles.xml'
// const path = './tests/smallwiki-latest-pages-articles.xml'; //3s
const path = '/Users/spencer/data/wikipedia/twinpeaks_pages_current.xml'; //3s
// const path = './tests/tinywiki-latest-pages-articles.xml'; //2s
const dbName = path.match(/\/([a-z-]+)-latest-pages/)[1];
const dbName = 'twinpeaks' //path.match(/\/([a-z-]+)-latest-pages/)[1];

//db.pages.find({title:'Doppelgängers'})

let options = {
file: path,
db: dbName,
Expand All @@ -32,7 +36,6 @@ let options = {
// }
};


// #1 - Pous Adrianus I
// #2 - Beenvis
// #2 - Makriel
Expand Down
3 changes: 2 additions & 1 deletion src/worker/01-parsePage.js
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,9 @@ const parsePage = function(txt, worker) {
console.log('--no page id--');
}
//get wiki text
m = txt.match(/<text xml:space="preserve">([\s\S]*?)<\/text>/);
m = txt.match(/<text xml:space="preserve"([\s\S]*?)<\/text>/);
if (m !== null) {
m[1] = m[1].replace(/^.*?>/, '')
page.wiki = m[1];
}
return page;
Expand Down

0 comments on commit bd010f5

Please sign in to comment.