Skip to content

Commit

Permalink
add html filter
Browse files Browse the repository at this point in the history
  • Loading branch information
hannob committed Jul 25, 2024
1 parent 236b701 commit 0e1af78
Showing 1 changed file with 12 additions and 2 deletions.
14 changes: 12 additions & 2 deletions keyfinder
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ import os
import re
import sys
import urllib.parse
import warnings

import bs4
import requests
from cryptography.hazmat.primitives import serialization

Expand All @@ -24,7 +26,12 @@ def filter_unesc(inkey):
return inkey.encode().decode("unicode_escape")


kfilters = [filter_none, filter_unesc]
def filter_html(inkey):
    """Return the text content of *inkey* after parsing it as HTML.

    The input is handed to BeautifulSoup using the "lxml" parser, and the
    text extracted from the resulting document is returned.
    """
    return bs4.BeautifulSoup(inkey, "lxml").get_text()


# List of the key filter functions defined above.
# NOTE(review): the consumer of this list is not visible in this hunk;
# presumably each filter is tried in turn on candidate key text - confirm.
kfilters = [filter_none, filter_unesc, filter_html]


def findkeys(data):
Expand Down Expand Up @@ -72,13 +79,16 @@ def writekey(key, fn, path):


if __name__ == "__main__":

ap = argparse.ArgumentParser()
ap.add_argument("input", nargs="+")
ap.add_argument("-o", "--outdir")
ap.add_argument("-u", "--url", action="store_true", help="URL instead of dir")
args = ap.parse_args()

# Prevents BeautifulSoup warnings, e.g., when content only
# contains a single URL or a filename.
warnings.filterwarnings("ignore", category=bs4.MarkupResemblesLocatorWarning)

if args.url:
for url in args.input:
host = urllib.parse.urlparse(url).netloc
Expand Down

0 comments on commit 0e1af78

Please sign in to comment.