diff --git a/src/util.cc b/src/util.cc
index 896303a..052fbbc 100644
--- a/src/util.cc
+++ b/src/util.cc
@@ -1,6 +1,7 @@
 #include "util.hh"
 #include
 #include
+#include <cstring>
 #include
 #include
 #include
@@ -10,6 +11,8 @@
 #include
 #include
 #include
+#include <boost/iostreams/filtering_stream.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
 #include
 #include
 #include
@@ -129,6 +132,24 @@ namespace util {
         f.close();
     }
 
+    // Returns true if the file starts with the gzip magic number (0x1F 0x8B);
+    // false otherwise, or (after logging an error) if the file cannot be opened.
+    // solution from https://stackoverflow.com/questions/37822645/c-read-and-compare-magic-number-from-file
+    bool isCompressedFile(const std::string &filename) {
+        std::ifstream input(filename, std::ios::binary);
+        if (!input.is_open()) {
+            BOOST_LOG_TRIVIAL(error) << "Could not open file '" << filename << "'";
+            return false;
+        }
+
+        const unsigned char magicref[2] = {0x1F, 0x8B};
+        unsigned char magic[2] = {0};
+        input.read(reinterpret_cast<char*>(magic), sizeof(magic));
+        // files shorter than the magic number leave gcount() below sizeof(magic)
+        return input.gcount() == static_cast<std::streamsize>(sizeof(magic))
+            && std::memcmp(magic, magicref, sizeof(magic)) == 0;
+    }
+
     void readUrlFiltersRegex(const std::string &filename, boost::regex &urlFilter) {
         std::ifstream f(filename);
         std::string line;
@@ -154,6 +175,36 @@ namespace util {
         urlFilter.assign(combined.str(), boost::regex::optimize | boost::regex::nosubs);
     }
 
+    // Read a gzip compressed list of domains, one per line, into domainFilter.
+    // Blank lines and lines starting with '#' are skipped.
+    // Aborts if the file cannot be opened or lacks the gzip magic number.
+    void readDomainFilters(const std::string &filename, std::unordered_set<std::string> &domainFilter) {
+        // The magic number is checked by hand because std::getline does not let
+        // decompressor exceptions escape (it only sets badbit), so an uncompressed
+        // file would otherwise silently leave the filter empty.
+        if (!isCompressedFile(filename)) {
+            BOOST_LOG_TRIVIAL(error) << "Domain list file not gzip compressed '" << filename << "'";
+            abort();
+        }
+
+        std::ifstream f(filename, std::ios_base::in | std::ios_base::binary);
+        boost::iostreams::filtering_stream<boost::iostreams::input> in;
+        // gzip framing (RFC 1952) needs gzip_decompressor; zlib_decompressor only
+        // understands the zlib format (RFC 1950) and fails on a gzip header
+        in.push(boost::iostreams::gzip_decompressor());
+        in.push(f);
+
+        std::string line;
+        while (std::getline(in, line)) {
+            if (boost::algorithm::all(line, boost::algorithm::is_space()) || boost::algorithm::starts_with(line, "#"))
+                continue;
+            domainFilter.insert(line);
+        }
+        // badbit here means decompression failed part-way: the list is incomplete
+        if (in.bad())
+            BOOST_LOG_TRIVIAL(error) << "Error decompressing domain list file '" << filename << "'";
+    }
+
     bool createDirectories(const std::string& path){
         if (!boost::filesystem::exists(path))
             return boost::filesystem::create_directories(path);
diff --git a/src/util.hh b/src/util.hh
index a6f5702..ee807dc 100644
--- a/src/util.hh
+++ b/src/util.hh
@@ -19,6 +19,8 @@ namespace util {
     void trimLines(std::string& text);
     void trimLinesCopy(const std::string& original, std::string& result);
 
+    bool isCompressedFile(const std::string &filename);
+
     // detect charset using uchardet
     bool detectCharset(const std::string& text, std::string& charset, const std::string& original_charset = "");
     // convert to utf8
@@ -57,6 +59,8 @@ namespace util {
 
     void readUrlFiltersRegex(const std::string &filename, boost::regex &urlFilter);
 
+    void readDomainFilters(const std::string &filename, std::unordered_set<std::string> &domainFilter);
+
     bool createDirectories(const std::string& path);
 
     std::vector split(const std::string& s, const std::string& delimiter);
diff --git a/src/warcpreprocessor.cc b/src/warcpreprocessor.cc
index 6934c65..bab1721 100644
--- a/src/warcpreprocessor.cc
+++ b/src/warcpreprocessor.cc
@@ -62,6 +62,7 @@ namespace {
 
 namespace warc2text {
     const std::unordered_set WARCPreprocessor::removeExtensions = {".jpg", ".jpeg", ".gif", ".png", ".css", ".js", ".mp3", ".mp4", ".flv", ".wmv", ".gz", ".zip", ".rar" };
+    const boost::regex WARCPreprocessor::domainExtractor("^(https?:\\/\\/)?(www\\.)?([^:\\/]+)(.*)", boost::regex::extended|boost::regex::icase);
 
     WARCPreprocessor::WARCPreprocessor(const LanguageDetector &detector, WARCPreprocessorOptions const &options) :
         detector(detector),
@@ -80,6 +81,9 @@ namespace warc2text {
         if (!options.url_filters_filename.empty())
             util::readUrlFiltersRegex(options.url_filters_filename, urlFilter);
 
+        if (!options.domain_filters_filename.empty())
+            util::readDomainFilters(options.domain_filters_filename, domainFilter);
+
         if (!options.pdf_warc_filename.empty())
             pdf_warc_writer.open(options.pdf_warc_filename);
 
@@ -101,6 +105,19 @@ namespace warc2text {
         return true;
     }
 
+    // true if the url's domain (scheme and leading "www." stripped) is not in the filter list
+    bool WARCPreprocessor::filterDomain(const std::string& url) const {
+        // nothing to match against: skip the regex, it would otherwise run once per record
+        if (domainFilter.empty())
+            return true;
+        const std::string domain = boost::regex_replace(url, domainExtractor, "$3");
+        BOOST_LOG_TRIVIAL(trace) << "Domain extracted '" << domain << "'";
+        if (domainFilter.find(domain) == domainFilter.end())
+            return true;
+        BOOST_LOG_TRIVIAL(trace) << "Domain filter matched '" << url << "'";
+        return false;
+    }
+
     void WARCPreprocessor::process(const std::string& filename) {
         BOOST_LOG_TRIVIAL(info) << "Processing " << filename;
         WARCReader reader(filename);
@@ -145,6 +162,9 @@ namespace warc2text {
             if (!URLfilter(record.getURL()))
                 continue;
 
+            if (!filterDomain(record.getURL()))
+                continue;
+
             if (options.encodeURLs)
                 record.encodeURL();
 
diff --git a/src/warcpreprocessor.hh b/src/warcpreprocessor.hh
index d4f85eb..f7b3954 100644
--- a/src/warcpreprocessor.hh
+++ b/src/warcpreprocessor.hh
@@ -37,6 +37,7 @@ namespace warc2text {
         bool tag_filters_invert{};
 
         std::string url_filters_filename;
+        std::string domain_filters_filename;
 
         bool multilang{};
         bool encodeURLs{};
@@ -57,9 +58,12 @@ namespace warc2text {
             unsigned int langBytes;
             util::umap_tag_filters_regex tagFilters;
             boost::regex urlFilter;
-            
+            std::unordered_set<std::string> domainFilter;
+
             static const std::unordered_set removeExtensions;
+            static const boost::regex domainExtractor;
             bool URLfilter(const std::string& url) const;
+            bool filterDomain(const std::string& url) const;
 
         public:
             explicit WARCPreprocessor(LanguageDetector const &detector, WARCPreprocessorOptions const &options);
diff --git a/warc2text_main.cc b/warc2text_main.cc
index e94e467..2d5b26b 100644
--- a/warc2text_main.cc
+++ b/warc2text_main.cc
@@ -33,6 +33,7 @@ void parseArgs(int argc, char *argv[], Options& out) {
         ("tag-filters", po::value(&out.tag_filters_filename), "Plain text file containing tag filters")
         ("invert-tag-filters", po::bool_switch(&out.tag_filters_invert)->default_value(false), "Invert tag filter application")
         ("url-filters", po::value(&out.url_filters_filename), "Plain text file containing url filters")
+        ("domain-filters", po::value(&out.domain_filters_filename), "Gzip compressed text file containing domain filters")
         ("pdfpass", po::value(&out.pdf_warc_filename), "Write PDF records to WARC")
         ("robotspass", po::value(&out.robots_warc_filename), "Write robots.txt records to WARC")
         ("paragraph-identification", po::bool_switch(&out.paragraph_identification)->default_value(false), "Add paragraph index in each b64encoded document as tab separated column")
@@ -65,6 +66,8 @@ void parseArgs(int argc, char *argv[], Options& out) {
             " --invert-tag-filters Only output records that got filtered\n"
             " --url-filters File containing url filters\n"
             " Format: \"regexp\"\n"
+            " --domain-filters Gzip compressed file containing domain filters\n"
+            " Format: each line containing a domain name\n"
             " --pdfpass Write PDF records to \n"
             " --robotspass Write Robots.txt records to \n"
             " --encode-urls Encode URLs obtained from WARC records\n"