From 0e1af78401ea4bd7a3d2e00adc9ebbf804476317 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hanno=20B=C3=B6ck?= <990588+hannob@users.noreply.github.com>
Date: Thu, 25 Jul 2024 16:00:33 +0200
Subject: [PATCH] add html filter

---
 keyfinder | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/keyfinder b/keyfinder
index 02b6574..3a189dd 100755
--- a/keyfinder
+++ b/keyfinder
@@ -5,7 +5,9 @@ import os
 import re
 import sys
 import urllib.parse
+import warnings
 
+import bs4
 import requests
 from cryptography.hazmat.primitives import serialization
 
@@ -24,7 +26,12 @@ def filter_unesc(inkey):
     return inkey.encode().decode("unicode_escape")
 
 
-kfilters = [filter_none, filter_unesc]
+def filter_html(inkey):
+    html = bs4.BeautifulSoup(inkey, "lxml")
+    return html.get_text()
+
+
+kfilters = [filter_none, filter_unesc, filter_html]
 
 
 def findkeys(data):
@@ -72,13 +79,16 @@ def writekey(key, fn, path):
 
 
 if __name__ == "__main__":
-
     ap = argparse.ArgumentParser()
     ap.add_argument("input", nargs="+")
     ap.add_argument("-o", "--outdir")
     ap.add_argument("-u", "--url", action="store_true", help="URL instead of dir")
     args = ap.parse_args()
 
+    # Prevents BeautifulSoup warnings, e.g., when content only
+    # contains a single URL or a filename.
+    warnings.filterwarnings("ignore", category=bs4.MarkupResemblesLocatorWarning)
+
     if args.url:
         for url in args.input:
             host = urllib.parse.urlparse(url).netloc