Skip to content

Commit

Permalink
quick workaround for #20
Browse files: browse the repository at this point in the history
  • Loading branch information
zuny26 committed Jan 21, 2021
1 parent fd73cc2 commit 1b6068a
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 5 deletions.
4 changes: 2 additions & 2 deletions src/warcpreprocessor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ namespace warc2text {

while (!done) {
done = !reader.getRecord(content);
if (done)
if (done or content.empty())
continue;

Record record(content);
Expand All @@ -67,7 +67,7 @@ namespace warc2text {
// Work-around for https://github.com/bitextor/warc2text/issues/16 for ParaCrawl:
// we do not really have a use case for massive PDFs at this moment, so skip them.
if (content.size() >= static_cast<std::size_t>(std::numeric_limits<uInt>::max())) {
BOOST_LOG_TRIVIAL(info) << "PDF too large to compress with util::BZCompress";
BOOST_LOG_TRIVIAL(info) << "PDF too large to compress with util::GZCompress";
continue;
}

Expand Down
10 changes: 8 additions & 2 deletions src/warcreader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ namespace warc2text {
closeFile();
}

bool WARCReader::getRecord(std::string& out){
bool WARCReader::getRecord(std::string& out, std::size_t max_size){
int inflate_ret = 0;
out.clear();
std::size_t len;
bool skip_record = false;
while (inflate_ret != Z_STREAM_END) {
if (s.avail_in == 0) {
len = readChunk();
Expand All @@ -55,7 +56,12 @@ namespace warc2text {
out.clear();
return false;
}
out.append(scratch, scratch + (BUFFER_SIZE - s.avail_out));
if (not skip_record) out.append(scratch, scratch + (BUFFER_SIZE - s.avail_out));
if (out.size() > max_size) {
BOOST_LOG_TRIVIAL(error) << "WARC " << warc_filename << ": skipping large record";
out.clear();
skip_record = true;
}
}
if (inflate_ret == Z_STREAM_END) {
assert(inflateReset(&s) == Z_OK);
Expand Down
2 changes: 1 addition & 1 deletion src/warcreader.hh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace warc2text {
public:
WARCReader();
explicit WARCReader(const std::string& filename);
bool getRecord(std::string& out);
bool getRecord(std::string& out, std::size_t max_size = 1024*1024*20); //20MB
~WARCReader();
private:
std::FILE* file;
Expand Down

0 comments on commit 1b6068a

Please sign in to comment.