From c2c0ae26a866aab613b8b5bd3dca31d251ec41b0 Mon Sep 17 00:00:00 2001 From: Esa Jokinen Date: Wed, 17 Jul 2024 16:09:10 +0300 Subject: [PATCH 1/7] Use fresh top500 list from moz.com Refresh top500 list every time sites.py is imported. Keep cached list in top500.json and use it as a backup. --- sites.py | 559 ++++++---------------------------------------------- top500.json | 502 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 558 insertions(+), 503 deletions(-) create mode 100644 top500.json diff --git a/sites.py b/sites.py index 835ca05..2809422 100644 --- a/sites.py +++ b/sites.py @@ -1,504 +1,57 @@ -# https://moz.com/top500 +# Fetch top500 sites from https://moz.com/top500 with fallback to cache file +import csv +import json +import urllib.request +import urllib.error -top500 = { - 1: "www.google.com", - 2: "www.blogger.com", - 3: "youtube.com", - 4: "linkedin.com", - 5: "microsoft.com", - 6: "apple.com", - 7: "play.google.com", - 8: "support.google.com", - 9: "mozilla.org", - 10: "wordpress.org", - 11: "youtu.be", - 12: "docs.google.com", - 13: "en.wikipedia.org", - 14: "cloudflare.com", - 15: "maps.google.com", - 16: "googleusercontent.com", - 17: "bp.blogspot.com", - 18: "drive.google.com", - 19: "sites.google.com", - 20: "accounts.google.com", - 21: "adobe.com", - 22: "plus.google.com", - 23: "europa.eu", - 24: "whatsapp.com", - 25: "istockphoto.com", - 26: "es.wikipedia.org", - 27: "facebook.com", - 28: "vk.com", - 29: "uol.com.br", - 30: "github.com", - 31: "t.me", - 32: "amazon.com", - 33: "vimeo.com", - 34: "news.google.com", - 35: "search.google.com", - 36: "enable-javascript.com", - 37: "myspace.com", - 38: "google.co.jp", - 39: "cpanel.net", - 40: "jimdofree.com", - 41: "mail.google.com", - 42: "w3.org", - 43: "policies.google.com", - 44: "who.int", - 45: "terra.com.br", - 46: "reuters.com", - 47: "paypal.com", - 48: "dropbox.com", - 49: "imdb.com", - 50: "google.com.br", - 51: "globo.com", - 52: "live.com", - 53: "google.es", - 54: "dailymotion.com", - 55: "google.de", - 56: "forbes.com", - 57: "developers.google.com", - 58: "files.wordpress.com", - 59: "medium.com", - 60: "bbc.co.uk", - 61: "bbc.com", - 62: "cnn.com", - 63: "wikimedia.org", - 64: "creativecommons.org", - 65: "brandbucket.com", - 66: "feedburner.com", - 67: "www.weebly.com", - 68: "tools.google.com", - 69: "slideshare.net", - 70: "theguardian.com", - 71: "pt.wikipedia.org", - 72: "fr.wikipedia.org", - 73: "line.me", - 74: "youronlinechoices.com", - 75: "nytimes.com", - 76: "gstatic.com", - 77: "www.yahoo.com", - 78: "nih.gov", - 79: "fb.com", - 80: "google.ru", - 81: "tinyurl.com", - 82: "t.co", - 83: "aliexpress.com", - 84: "photos.google.com", - 85: "wired.com", - 86: "amazon.de", - 87: "ok.ru", - 88: "bloomberg.com", - 89: "foxnews.com", - 90: "wa.me", - 91: "indiatimes.com", - 92: "huffpost.com", - 93: "wsj.com", - 94: "amazon.co.uk", - 95: "4shared.com", - 96: "businessinsider.com", - 97: "goo.gl", - 98: "google.fr", - 99: "buydomains.com", - 100: "issuu.com", - 101: "ig.com.br", - 102: "independent.co.uk", - 103: "www.gov.br", - 104: "afternic.com", - 105: "rakuten.co.jp", - 106: "draft.blogger.com", - 107: "elmundo.es", - 108: "ru.wikipedia.org", - 109: "de.wikipedia.org", - 110: "abril.com.br", - 111: "usatoday.com", - 112: "scribd.com", - 113: "ipv4.google.com", - 114: "mediafire.com", - 115: "forms.gle", - 116: "namecheap.com", - 117: "amazon.co.jp", - 118: "washingtonpost.com", - 119: "id.wikipedia.org", - 120: "plesk.com", - 121: "myaccount.google.com", - 122: "booking.com", - 
123: "hugedomains.com", - 124: "mirror.co.uk", - 125: "cpanel.com", - 126: "ytimg.com", - 127: "books.google.com", - 128: "huffingtonpost.com", - 129: "shutterstock.com", - 130: "nature.com", - 131: "dailymail.co.uk", - 132: "twitter.com", - 133: "elpais.com", - 134: "pixabay.com", - 135: "researchgate.net", - 136: "google.it", - 137: "telegram.me", - 138: "fandom.com", - 139: "cdc.gov", - 140: "list-manage.com", - 141: "un.org", - 142: "marketingplatform.google....", - 143: "news.yahoo.com", - 144: "cnet.com", - 145: "gravatar.com", - 146: "bit.ly", - 147: "pinterest.com", - 148: "office.com", - 149: "netvibes.com", - 150: "telegraph.co.uk", - 151: "opera.com", - 152: "dan.com", - 153: "android.com", - 154: "msn.com", - 155: "estadao.com.br", - 156: "nasa.gov", - 157: "aboutads.info", - 158: "time.com", - 159: "wp.com", - 160: "get.google.com", - 161: "change.org", - 162: "hatena.ne.jp", - 163: "mail.ru", - 164: "wikia.com", - 165: "networkadvertising.org", - 166: "thesun.co.uk", - 167: "google.pl", - 168: "amazon.fr", - 169: "google.co.uk", - 170: "amazon.es", - 171: "archive.org", - 172: "ja.wikipedia.org", - 173: "express.co.uk", - 174: "www.gov.uk", - 175: "translate.google.com", - 176: "picasaweb.google.com", - 177: "economist.com", - 178: "outlook.com", - 179: "samsung.com", - 180: "lycos.com", - 181: "weibo.com", - 182: "whitehouse.gov", - 183: "akamaized.net", - 184: "gizmodo.com", - 185: "quora.com", - 186: "google.ca", - 187: "qq.com", - 188: "ikea.com", - 189: "mozilla.com", - 190: "goodreads.com", - 191: "akamaihd.net", - 192: "lemonde.fr", - 193: "theatlantic.com", - 194: "bp1.blogger.com", - 195: "ea.com", - 196: "cambridge.org", - 197: "springer.com", - 198: "nikkei.com", - 199: "code.google.com", - 200: "imageshack.us", - 201: "calendar.google.com", - 202: "feedproxy.google.com", - 203: "sputniknews.com", - 204: "insider.com", - 205: "addtoany.com", - 206: "pexels.com", - 207: "sciencedaily.com", - 208: "www.wikipedia.org", - 209: "prezi.com", - 210: "dw.com", - 211: "amazon.it", - 212: "berkeley.edu", - 213: "yandex.ru", - 214: "detik.com", - 215: "sedo.com", - 216: "ziddu.com", - 217: "bitly.com", - 218: "metro.co.uk", - 219: "t-online.de", - 220: "icann.org", - 221: "pbs.org", - 222: "it.wikipedia.org", - 223: "lavanguardia.com", - 224: "bp2.blogger.com", - 225: "thetimes.co.uk", - 226: "google.co.in", - 227: "liberation.fr", - 228: "ggpht.com", - 229: "unesco.org", - 230: "privacyshield.gov", - 231: "xbox.com", - 232: "timeweb.ru", - 233: "spotify.com", - 234: "yadi.sk", - 235: "cbc.ca", - 236: "abcnews.go.com", - 237: "leparisien.fr", - 238: "bild.de", - 239: "www.over-blog.com", - 240: "cornell.edu", - 241: "clarin.com", - 242: "nginx.com", - 243: "gofundme.com", - 244: "rtve.es", - 245: "sciencemag.org", - 246: "soundcloud.com", - 247: "chicagotribune.com", - 248: "discord.gg", - 249: "search.yahoo.com", - 250: "sfgate.com", - 251: "wiley.com", - 252: "ft.com", - 253: "netlify.app", - 254: "yelp.com", - 255: "bloglovin.com", - 256: "alexa.com", - 257: "abc.es", - 258: "urbandictionary.com", - 259: "interia.pl", - 260: "smh.com.au", - 261: "newyorker.com", - 262: "cnil.fr", - 263: "rt.com", - 264: "gooyaabitemplates.com", - 265: "gmail.com", - 266: "dailystar.co.uk", - 267: "cnbc.com", - 268: "doi.org", - 269: "www.wix.com", - 270: "cbsnews.com", - 271: "surveymonkey.com", - 272: "plos.org", - 273: "gnu.org", - 274: "google.nl", - 275: "francetvinfo.fr", - 276: "sapo.pt", - 277: "psychologytoday.com", - 278: "ria.ru", - 279: "offset.com", - 280: "discord.com", - 281: 
"guardian.co.uk", - 282: "as.com", - 283: "godaddy.com", - 284: "ca.gov", - 285: "academia.edu", - 286: "aol.com", - 287: "nydailynews.com", - 288: "freepik.com", - 289: "vice.com", - 290: "canada.ca", - 291: "engadget.com", - 292: "sciencedirect.com", - 293: "target.com", - 294: "mit.edu", - 295: "nginx.org", - 296: "sendspace.com", - 297: "twitch.tv", - 298: "finance.yahoo.com", - 299: "e-monsite.com", - 300: "instagram.com", - 301: "stanford.edu", - 302: "wikihow.com", - 303: "kickstarter.com", - 304: "eventbrite.com", - 305: "disney.com", - 306: "welt.de", - 307: "npr.org", - 308: "webmd.com", - 309: "mashable.com", - 310: "pinterest.fr", - 311: "skype.com", - 312: "tiktok.com", - 313: "addthis.com", - 314: "pl.wikipedia.org", - 315: "apache.org", - 316: "adssettings.google.com", - 317: "yahoo.co.jp", - 318: "tripadvisor.com", - 319: "washington.edu", - 320: "oracle.com", - 321: "britannica.com", - 322: "hm.com", - 323: "deezer.com", - 324: "buzzfeed.com", - 325: "google.com.tw", - 326: "bing.com", - 327: "arxiv.org", - 328: "techcrunch.com", - 329: "standard.co.uk", - 330: "ted.com", - 331: "alibaba.com", - 332: "zoom.us", - 333: "ebay.com", - 334: "rapidshare.com", - 335: "fb.me", - 336: "shopify.com", - 337: "mega.nz", - 338: "lefigaro.fr", - 339: "zdf.de", - 340: "amzn.to", - 341: "picasa.google.com", - 342: "playstation.com", - 343: "photos1.blogger.com", - 344: "secureserver.net", - 345: "themeforest.net", - 346: "hp.com", - 347: "ietf.org", - 348: "netflix.com", - 349: "ibm.com", - 350: "storage.googleapis.com", - 351: "oup.com", - 352: "php.net", - 353: "tmz.com", - 354: "canva.com", - 355: "naver.com", - 356: "zendesk.com", - 357: "disqus.com", - 358: "news.com.au", - 359: "theverge.com", - 360: "about.com", - 361: "ovh.com", - 362: "groups.google.com", - 363: "repubblica.it", - 364: "photobucket.com", - 365: "newsweek.com", - 366: "thenai.org", - 367: "20minutos.es", - 368: "ovh.net", - 369: "nationalgeographic.com", - 370: "nbcnews.com", - 371: "instructables.com", - 372: "doubleclick.net", - 373: "m.wikipedia.org", - 374: "biglobe.ne.jp", - 375: "hollywoodreporter.com", - 376: "alicdn.com", - 377: "harvard.edu", - 378: "steampowered.com", - 379: "latimes.com", - 380: "spiegel.de", - 381: "bandcamp.com", - 382: "huawei.com", - 383: "googleblog.com", - 384: "clickbank.net", - 385: "espn.com", - 386: "walmart.com", - 387: "ign.com", - 388: "abc.net.au", - 389: "variety.com", - 390: "dreamstime.com", - 391: "nypost.com", - 392: "upenn.edu", - 393: "businesswire.com", - 394: "boston.com", - 395: "ssl-images-amazon.com", - 396: "politico.com", - 397: "nba.com", - 398: "apnews.com", - 399: "dreniq.com", - 400: "asahi.com", - 401: "statista.com", - 402: "mayoclinic.org", - 403: "venturebeat.com", - 404: "scmp.com", - 405: "trustpilot.com", - 406: "depositfiles.com", - 407: "giphy.com", - 408: "ftc.gov", - 409: "last.fm", - 410: "video.google.com", - 411: "sony.com", - 412: "thefreedictionary.com", - 413: "unicef.org", - 414: "kompas.com", - 415: "biblegateway.com", - 416: "twimg.com", - 417: "asus.com", - 418: "nokia.com", - 419: "stackoverflow.com", - 420: "ap.org", - 421: "dictionary.com", - 422: "sina.com.cn", - 423: "eonline.com", - 424: "etsy.com", - 425: "uefa.com", - 426: "focus.de", - 427: "utexas.edu", - 428: "nicovideo.jp", - 429: "techradar.com", - 430: "evernote.com", - 431: "loc.gov", - 432: "merriam-webster.com", - 433: "soratemplates.com", - 434: "ucla.edu", - 435: "flickr.com", - 436: "lg.com", - 437: "dot.tk", - 438: "google.co.id", - 439: "blog.google", - 440: 
"imageshack.com", - 441: "fda.gov", - 442: "fortune.com", - 443: "usgs.gov", - 444: "box.com", - 445: "mercurynews.com", - 446: "yandex.com", - 447: "example.com", - 448: "iso.org", - 449: "amazon.ca", - 450: "usnews.com", - 451: "people.com", - 452: "python.org", - 453: "timeout.com", - 454: "kakao.com", - 455: "stores.jp", - 456: "softonic.com", - 457: "pinterest.co.uk", - 458: "nifty.com", - 459: "namesilo.com", - 460: "fifa.com", - 461: "state.gov", - 462: "nhk.or.jp", - 463: "thoughtco.com", - 464: "billboard.com", - 465: "cointernet.com.co", - 466: "hbr.org", - 467: "jhu.edu", - 468: "ucoz.ru", - 469: "linktr.ee", - 470: "bp3.blogger.com", - 471: "pcmag.com", - 472: "www.livejournal.com", - 473: "sakura.ne.jp", - 474: "pnas.org", - 475: "ovh.co.uk", - 476: "vkontakte.ru", - 477: "rfi.fr", - 478: "fastcompany.com", - 479: "groups.yahoo.com", - 480: "mixcloud.com", - 481: "xda-developers.com", - 482: "weather.com", - 483: "liveinternet.ru", - 484: "gettyimages.com", - 485: "over-blog-kiwi.com", - 486: "vox.com", - 487: "debian.org", - 488: "g.co", - 489: "nvidia.com", - 490: "rambler.ru", - 491: "bustle.com", - 492: "oreilly.com", - 493: "usda.gov", - 494: "princeton.edu", - 495: "ebay.co.uk", - 496: "archives.gov", - 497: "wn.com", - 498: "imgur.com", - 499: "thenextweb.com", - 500: "digitaltrends.com", -} +from json_logger import log + +TOP_SITES_SOURCE = "https://moz.com/top-500/download?table=top500Domains" +CACHE_FILE = "top500.json" + +top500 = {} + +try: + log(target="INFO", message=f"updating top500 from {TOP_SITES_SOURCE}") + + req = urllib.request.Request( + TOP_SITES_SOURCE, + data=None, + headers={"User-Agent": "Mozilla/5.0 (OllieJC/findsecuritycontacts.com)"}, + ) + resp = urllib.request.urlopen(req) + sites = csv.DictReader([l.decode("utf-8") for l in resp.readlines()]) + + for site in sites: + try: + top500[int(site["Rank"])] = site["Root Domain"] + except ValueError as e: + log(target="ERROR", error=e) + + try: + log(target="INFO", message=f"saving updated top500 list as {CACHE_FILE}") + with open(CACHE_FILE, "w") as cachefile: + json.dump(top500, cachefile, indent=2) + except (OSError, Exception) as e: + log(target="ERROR", error=e) + +except ( + urllib.error.URLError, + urllib.error.HTTPError, + urllib.error.ContentTooShortError, +) as e: + log(target="ERROR", message="fetching top sites failed", error=e) + log(target="INFO", message=f"falling back to cache file {CACHE_FILE}") + + try: + with open(CACHE_FILE, "r") as cachefile: + top500 = {int(k): str(v) for k, v in dict(json.load(cachefile)).items()} + except ValueError as e: + log(target="ERROR", message="non-numerical keys in cache file", error=e) + except (OSError, Exception) as e: + log(target="ERROR", message="loading of cache file failed", error=e) + + +if __name__ == "__main__": + """Print the resulting top500 dictionary when called directly""" + print(top500) diff --git a/top500.json b/top500.json new file mode 100644 index 0000000..9031a34 --- /dev/null +++ b/top500.json @@ -0,0 +1,502 @@ +{ + "1": "www.google.com", + "2": "www.blogger.com", + "3": "youtube.com", + "4": "linkedin.com", + "5": "support.google.com", + "6": "cloudflare.com", + "7": "microsoft.com", + "8": "apple.com", + "9": "en.wikipedia.org", + "10": "play.google.com", + "11": "wordpress.org", + "12": "docs.google.com", + "13": "mozilla.org", + "14": "maps.google.com", + "15": "youtu.be", + "16": "drive.google.com", + "17": "bp.blogspot.com", + "18": "sites.google.com", + "19": "googleusercontent.com", + "20": "accounts.google.com", + "21": 
"t.me", + "22": "europa.eu", + "23": "plus.google.com", + "24": "whatsapp.com", + "25": "adobe.com", + "26": "facebook.com", + "27": "policies.google.com", + "28": "uol.com.br", + "29": "istockphoto.com", + "30": "vimeo.com", + "31": "vk.com", + "32": "github.com", + "33": "amazon.com", + "34": "search.google.com", + "35": "bbc.co.uk", + "36": "google.de", + "37": "live.com", + "38": "gravatar.com", + "39": "nih.gov", + "40": "dan.com", + "41": "files.wordpress.com", + "42": "www.yahoo.com", + "43": "cnn.com", + "44": "dropbox.com", + "45": "wikimedia.org", + "46": "creativecommons.org", + "47": "google.com.br", + "48": "line.me", + "49": "googleblog.com", + "50": "opera.com", + "51": "es.wikipedia.org", + "52": "globo.com", + "53": "brandbucket.com", + "54": "myspace.com", + "55": "slideshare.net", + "56": "paypal.com", + "57": "tiktok.com", + "58": "netvibes.com", + "59": "theguardian.com", + "60": "who.int", + "61": "goo.gl", + "62": "medium.com", + "63": "tools.google.com", + "64": "draft.blogger.com", + "65": "pt.wikipedia.org", + "66": "fr.wikipedia.org", + "67": "www.weebly.com", + "68": "news.google.com", + "69": "developers.google.com", + "70": "w3.org", + "71": "mail.google.com", + "72": "gstatic.com", + "73": "jimdofree.com", + "74": "cpanel.net", + "75": "imdb.com", + "76": "wa.me", + "77": "feedburner.com", + "78": "enable-javascript.com", + "79": "nytimes.com", + "80": "workspace.google.com", + "81": "ok.ru", + "82": "google.es", + "83": "dailymotion.com", + "84": "afternic.com", + "85": "bloomberg.com", + "86": "amazon.de", + "87": "photos.google.com", + "88": "wiley.com", + "89": "aliexpress.com", + "90": "indiatimes.com", + "91": "youronlinechoices.com", + "92": "elpais.com", + "93": "tinyurl.com", + "94": "yadi.sk", + "95": "spotify.com", + "96": "huffpost.com", + "97": "ru.wikipedia.org", + "98": "google.fr", + "99": "webmd.com", + "100": "samsung.com", + "101": "independent.co.uk", + "102": "amazon.co.jp", + "103": "get.google.com", + "104": "amazon.co.uk", + "105": "4shared.com", + "106": "telegram.me", + "107": "planalto.gov.br", + "108": "businessinsider.com", + "109": "ig.com.br", + "110": "issuu.com", + "111": "www.gov.br", + "112": "wsj.com", + "113": "hugedomains.com", + "114": "picasaweb.google.com", + "115": "usatoday.com", + "116": "scribd.com", + "117": "www.gov.uk", + "118": "storage.googleapis.com", + "119": "huffingtonpost.com", + "120": "bbc.com", + "121": "estadao.com.br", + "122": "nature.com", + "123": "mediafire.com", + "124": "washingtonpost.com", + "125": "forms.gle", + "126": "namecheap.com", + "127": "forbes.com", + "128": "mirror.co.uk", + "129": "soundcloud.com", + "130": "fb.com", + "131": "marketingplatform.google....", + "132": "domainmarket.com", + "133": "ytimg.com", + "134": "terra.com.br", + "135": "google.co.uk", + "136": "shutterstock.com", + "137": "dailymail.co.uk", + "138": "reg.ru", + "139": "t.co", + "140": "cdc.gov", + "141": "thesun.co.uk", + "142": "wp.com", + "143": "cnet.com", + "144": "instagram.com", + "145": "researchgate.net", + "146": "google.it", + "147": "fandom.com", + "148": "office.com", + "149": "list-manage.com", + "150": "msn.com", + "151": "un.org", + "152": "de.wikipedia.org", + "153": "ovh.com", + "154": "mail.ru", + "155": "bing.com", + "156": "news.yahoo.com", + "157": "myaccount.google.com", + "158": "hatena.ne.jp", + "159": "shopify.com", + "160": "adssettings.google.com", + "161": "bit.ly", + "162": "reuters.com", + "163": "booking.com", + "164": "discord.com", + "165": "buydomains.com", + "166": 
"nasa.gov", + "167": "aboutads.info", + "168": "time.com", + "169": "abril.com.br", + "170": "change.org", + "171": "nginx.org", + "172": "twitter.com", + "173": "www.wikipedia.org", + "174": "archive.org", + "175": "cbsnews.com", + "176": "networkadvertising.org", + "177": "telegraph.co.uk", + "178": "pinterest.com", + "179": "google.co.jp", + "180": "pixabay.com", + "181": "zendesk.com", + "182": "cpanel.com", + "183": "vistaprint.com", + "184": "sky.com", + "185": "windows.net", + "186": "alicdn.com", + "187": "google.ca", + "188": "lemonde.fr", + "189": "newyorker.com", + "190": "webnode.page", + "191": "surveymonkey.com", + "192": "translate.google.com", + "193": "calendar.google.com", + "194": "amazonaws.com", + "195": "academia.edu", + "196": "apache.org", + "197": "imageshack.us", + "198": "akamaihd.net", + "199": "nginx.com", + "200": "discord.gg", + "201": "thetimes.co.uk", + "202": "search.yahoo.com", + "203": "amazon.fr", + "204": "yelp.com", + "205": "berkeley.edu", + "206": "google.ru", + "207": "sedoparking.com", + "208": "cbc.ca", + "209": "unesco.org", + "210": "ggpht.com", + "211": "privacyshield.gov", + "212": "www.over-blog.com", + "213": "clarin.com", + "214": "www.wix.com", + "215": "whitehouse.gov", + "216": "icann.org", + "217": "gnu.org", + "218": "yandex.ru", + "219": "francetvinfo.fr", + "220": "gmail.com", + "221": "mozilla.com", + "222": "ziddu.com", + "223": "guardian.co.uk", + "224": "twitch.tv", + "225": "sedo.com", + "226": "foxnews.com", + "227": "rambler.ru", + "228": "books.google.com", + "229": "stanford.edu", + "230": "wikihow.com", + "231": "it.wikipedia.org", + "232": "20minutos.es", + "233": "sfgate.com", + "234": "liveinternet.ru", + "235": "ja.wikipedia.org", + "236": "000webhost.com", + "237": "espn.com", + "238": "eventbrite.com", + "239": "disney.com", + "240": "statista.com", + "241": "addthis.com", + "242": "pinterest.fr", + "243": "lavanguardia.com", + "244": "vkontakte.ru", + "245": "doubleclick.net", + "246": "bp2.blogger.com", + "247": "skype.com", + "248": "sciencedaily.com", + "249": "bloglovin.com", + "250": "insider.com", + "251": "pl.wikipedia.org", + "252": "sputniknews.com", + "253": "id.wikipedia.org", + "254": "doi.org", + "255": "nypost.com", + "256": "elmundo.es", + "257": "abcnews.go.com", + "258": "ipv4.google.com", + "259": "deezer.com", + "260": "express.co.uk", + "261": "detik.com", + "262": "mystrikingly.com", + "263": "rakuten.co.jp", + "264": "amzn.to", + "265": "arxiv.org", + "266": "alibaba.com", + "267": "fb.me", + "268": "wikia.com", + "269": "t-online.de", + "270": "telegra.ph", + "271": "mega.nz", + "272": "usnews.com", + "273": "plos.org", + "274": "naver.com", + "275": "ibm.com", + "276": "smh.com.au", + "277": "dw.com", + "278": "google.nl", + "279": "lefigaro.fr", + "280": "bp1.blogger.com", + "281": "picasa.google.com", + "282": "theatlantic.com", + "283": "nydailynews.com", + "284": "themeforest.net", + "285": "rtve.es", + "286": "newsweek.com", + "287": "ovh.net", + "288": "ca.gov", + "289": "goodreads.com", + "290": "economist.com", + "291": "target.com", + "292": "marca.com", + "293": "kickstarter.com", + "294": "hindustantimes.com", + "295": "weibo.com", + "296": "finance.yahoo.com", + "297": "huawei.com", + "298": "e-monsite.com", + "299": "hubspot.com", + "300": "npr.org", + "301": "netflix.com", + "302": "gizmodo.com", + "303": "netlify.app", + "304": "yandex.com", + "305": "mashable.com", + "306": "cnil.fr", + "307": "latimes.com", + "308": "steampowered.com", + "309": "rt.com", + "310": 
"photobucket.com", + "311": "quora.com", + "312": "nbcnews.com", + "313": "android.com", + "314": "instructables.com", + "315": "www.canalblog.com", + "316": "www.livejournal.com", + "317": "ouest-france.fr", + "318": "tripadvisor.com", + "319": "ovhcloud.com", + "320": "pexels.com", + "321": "oracle.com", + "322": "yahoo.co.jp", + "323": "addtoany.com", + "324": "sakura.ne.jp", + "325": "cointernet.com.co", + "326": "twimg.com", + "327": "britannica.com", + "328": "php.net", + "329": "standard.co.uk", + "330": "groups.google.com", + "331": "cnbc.com", + "332": "loc.gov", + "333": "qq.com", + "334": "buzzfeed.com", + "335": "godaddy.com", + "336": "ikea.com", + "337": "disqus.com", + "338": "taringa.net", + "339": "ea.com", + "340": "dropcatch.com", + "341": "techcrunch.com", + "342": "canva.com", + "343": "offset.com", + "344": "ebay.com", + "345": "zoom.us", + "346": "cambridge.org", + "347": "unsplash.com", + "348": "playstation.com", + "349": "people.com", + "350": "springer.com", + "351": "psychologytoday.com", + "352": "sendspace.com", + "353": "home.pl", + "354": "rapidshare.com", + "355": "prezi.com", + "356": "photos1.blogger.com", + "357": "thenai.org", + "358": "ftc.gov", + "359": "google.pl", + "360": "ted.com", + "361": "secureserver.net", + "362": "code.google.com", + "363": "plesk.com", + "364": "aol.com", + "365": "biglobe.ne.jp", + "366": "hp.com", + "367": "canada.ca", + "368": "linktr.ee", + "369": "hollywoodreporter.com", + "370": "ietf.org", + "371": "clickbank.net", + "372": "harvard.edu", + "373": "amazon.es", + "374": "oup.com", + "375": "timeweb.ru", + "376": "engadget.com", + "377": "vice.com", + "378": "cornell.edu", + "379": "dreamstime.com", + "380": "tmz.com", + "381": "gofundme.com", + "382": "pbs.org", + "383": "stackoverflow.com", + "384": "abc.net.au", + "385": "sciencedirect.com", + "386": "ft.com", + "387": "variety.com", + "388": "alexa.com", + "389": "abc.es", + "390": "walmart.com", + "391": "gooyaabitemplates.com", + "392": "redbull.com", + "393": "ssl-images-amazon.com", + "394": "theverge.com", + "395": "spiegel.de", + "396": "about.com", + "397": "nationalgeographic.com", + "398": "bandcamp.com", + "399": "m.wikipedia.org", + "400": "zippyshare.com", + "401": "wired.com", + "402": "freepik.com", + "403": "outlook.com", + "404": "mit.edu", + "405": "sapo.pt", + "406": "goo.ne.jp", + "407": "java.com", + "408": "google.co.th", + "409": "scmp.com", + "410": "mayoclinic.org", + "411": "scholastic.com", + "412": "nba.com", + "413": "reverbnation.com", + "414": "depositfiles.com", + "415": "video.google.com", + "416": "howstuffworks.com", + "417": "cbslocal.com", + "418": "merriam-webster.com", + "419": "focus.de", + "420": "admin.ch", + "421": "gfycat.com", + "422": "com.com", + "423": "narod.ru", + "424": "boston.com", + "425": "sony.com", + "426": "justjared.com", + "427": "bitly.com", + "428": "jstor.org", + "429": "amebaownd.com", + "430": "g.co", + "431": "gsmarena.com", + "432": "lexpress.fr", + "433": "reddit.com", + "434": "usgs.gov", + "435": "bigcommerce.com", + "436": "gettyimages.com", + "437": "ign.com", + "438": "justgiving.com", + "439": "techradar.com", + "440": "weather.com", + "441": "amazon.ca", + "442": "justice.gov", + "443": "sciencemag.org", + "444": "pcmag.com", + "445": "theconversation.com", + "446": "foursquare.com", + "447": "flickr.com", + "448": "giphy.com", + "449": "tvtropes.org", + "450": "fifa.com", + "451": "upenn.edu", + "452": "digg.com", + "453": "bestfreecams.club", + "454": "histats.com", + "455": 
"salesforce.com", + "456": "blog.google", + "457": "apnews.com", + "458": "theglobeandmail.com", + "459": "m.me", + "460": "europapress.es", + "461": "washington.edu", + "462": "thefreedictionary.com", + "463": "jhu.edu", + "464": "euronews.com", + "465": "liberation.fr", + "466": "ads.google.com", + "467": "trustpilot.com", + "468": "google.com.tw", + "469": "softonic.com", + "470": "kakao.com", + "471": "storage.canalblog.com", + "472": "interia.pl", + "473": "metro.co.uk", + "474": "viglink.com", + "475": "last.fm", + "476": "blackberry.com", + "477": "public-api.wordpress.com", + "478": "sina.com.cn", + "479": "unicef.org", + "480": "archives.gov", + "481": "nps.gov", + "482": "utexas.edu", + "483": "biblegateway.com", + "484": "usda.gov", + "485": "indiegogo.com", + "486": "nikkei.com", + "487": "radiofrance.fr", + "488": "repubblica.it", + "489": "substack.com", + "490": "ap.org", + "491": "nicovideo.jp", + "492": "joomla.org", + "493": "news.com.au", + "494": "allaboutcookies.org", + "495": "mailchimp.com", + "496": "stores.jp", + "497": "intel.com", + "498": "bp0.blogger.com", + "499": "box.com", + "500": "nhk.or.jp" +} \ No newline at end of file From 7194fe6721003d312cd74ebb232d13960d13be4e Mon Sep 17 00:00:00 2001 From: Esa Jokinen Date: Wed, 17 Jul 2024 23:42:31 +0300 Subject: [PATCH 2/7] Sites as a class & add max age for the cache --- generator.py | 17 +++---- sites.py | 133 +++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 105 insertions(+), 45 deletions(-) diff --git a/generator.py b/generator.py index 51bd528..781c780 100644 --- a/generator.py +++ b/generator.py @@ -4,7 +4,7 @@ import re import sys -import sites +from sites import Sites from multiprocessing import Pool @@ -70,10 +70,12 @@ def genSecurityTxtForDomain( details["target"]: { "rank": details["rank"], "has_contact": details["has_contact"], - "has_dns_contact": True - if "dnssecuritytxt" in details - and details["dnssecuritytxt"]["security_contact"] is not None - else False, + "has_dns_contact": ( + True + if "dnssecuritytxt" in details + and details["dnssecuritytxt"]["security_contact"] is not None + else False + ), } } if "http_security_txt" in details and details["http_security_txt"] != {}: @@ -173,9 +175,8 @@ def genStaticFiles(results: dict): domains_dict = {} if os.environ.get("GET_SEC_TXT", "false") == "true": - domains_dict = ( - sites.top500 - ) # {nn: sites.top500[nn] for nn in list(sites.top500)[:10]} + sites = Sites() + domains_dict = sites.getTop500() if len(domains_dict) > 0: print("Got domain lists, counts:") diff --git a/sites.py b/sites.py index 2809422..18fab25 100644 --- a/sites.py +++ b/sites.py @@ -1,57 +1,116 @@ # Fetch top500 sites from https://moz.com/top500 with fallback to cache file import csv import json +import time +import os import urllib.request import urllib.error from json_logger import log -TOP_SITES_SOURCE = "https://moz.com/top-500/download?table=top500Domains" -CACHE_FILE = "top500.json" -top500 = {} +class Sites: + TOP_SITES_SOURCE = "https://moz.com/top-500/download?table=top500Domains" + USER_AGENT = "Mozilla/5.0 (OllieJC/findsecuritycontacts.com)" + CACHE_FILE = "top500.json" + CACHE_MAX_AGE = 86400 -try: - log(target="INFO", message=f"updating top500 from {TOP_SITES_SOURCE}") + def __init__(self): + self.top500 = {} - req = urllib.request.Request( - TOP_SITES_SOURCE, - data=None, - headers={"User-Agent": "Mozilla/5.0 (OllieJC/findsecuritycontacts.com)"}, - ) - resp = urllib.request.urlopen(req) - sites = csv.DictReader([l.decode("utf-8") for l in 
resp.readlines()]) + def __str__(self) -> str: + return f"{self.getTop500()}" - for site in sites: + def getTop500(self) -> dict: + if self.cacheAge() > Sites.CACHE_MAX_AGE: + self.refreshCache() + else: + self.readFromCache() + return self.top500 + + def cacheAge(self) -> int: + """Return the age of the cache file if available; otherwise assumes expired cache""" + try: + return time.time() - os.path.getmtime(Sites.CACHE_FILE) + except (OSError, Exception) as e: + log( + target="WARNING", + message="cache file age not found; assume no cache", + error=e, + ) + return Sites.CACHE_MAX_AGE + 1 + + def readFromCache(self): + """Reads the site list from the cache""" try: - top500[int(site["Rank"])] = site["Root Domain"] + log( + target="INFO", + message=f"reading sites from cache {Sites.CACHE_FILE}", + ) + with open(Sites.CACHE_FILE, "r") as cachefile: + self.top500 = { + int(k): str(v) for k, v in dict(json.load(cachefile)).items() + } except ValueError as e: + log(target="ERROR", message="unexpected cache file content", error=e) + except (OSError, Exception) as e: + log(target="ERROR", message="loading of cache file failed", error=e) + + def updateCache(self): + """Writes the current site list to the cache file""" + try: + log( + target="INFO", + message=f"saving updated top500 list as {Sites.CACHE_FILE}", + ) + with open(Sites.CACHE_FILE, "w+") as cachefile: + json.dump(self.top500, cachefile, indent=2) + except (OSError, Exception) as e: log(target="ERROR", error=e) - try: - log(target="INFO", message=f"saving updated top500 list as {CACHE_FILE}") - with open(CACHE_FILE, "w") as cachefile: - json.dump(top500, cachefile, indent=2) - except (OSError, Exception) as e: - log(target="ERROR", error=e) - -except ( - urllib.error.URLError, - urllib.error.HTTPError, - urllib.error.ContentTooShortError, -) as e: - log(target="ERROR", message="fetching top sites failed", error=e) - log(target="INFO", message=f"falling back to cache file {CACHE_FILE}") - - try: - with open(CACHE_FILE, "r") as cachefile: - top500 = {int(k): str(v) for k, v in dict(json.load(cachefile)).items()} - except ValueError as e: - log(target="ERROR", message="non-numerical keys in cache file", error=e) - except (OSError, Exception) as e: - log(target="ERROR", message="loading of cache file failed", error=e) + def refreshCache(self): + """Fetches the site list from the Internet and updates the cache""" + try: + log(target="INFO", message=f"updating top500 from {Sites.TOP_SITES_SOURCE}") + + req = urllib.request.Request( + Sites.TOP_SITES_SOURCE, + data=None, + headers={"User-Agent": Sites.USER_AGENT}, + ) + resp = urllib.request.urlopen(req) + sites = csv.DictReader([l.decode("utf-8") for l in resp.readlines()]) + + for site in sites: + try: + self.top500[int(site["Rank"])] = site["Root Domain"] + except ValueError as e: + log(target="ERROR", error=e) + + if len(self.top500) > 0: + self.updateCache() + else: + log( + target="WARNING", + message="fetched list empty; falling back to cache file", + ) + self.readFromCache() + + except ( + urllib.error.URLError, + urllib.error.HTTPError, + urllib.error.ContentTooShortError, + KeyError, + ) as e: + if isinstance(e, KeyError): + log(target="ERROR", message="feched unexcepted content", error=e) + else: + log(target="ERROR", message="fetching top sites failed", error=e) + log(target="INFO", message=f"falling back to cache file {Sites.CACHE_FILE}") + self.readFromCache() if __name__ == "__main__": """Print the resulting top500 dictionary when called directly""" - print(top500) + sites = 
Sites() + print(sites) From 5761a962d054f1cd1c813a24f1794ccbdf01a4cd Mon Sep 17 00:00:00 2001 From: Esa Jokinen Date: Wed, 17 Jul 2024 23:46:39 +0300 Subject: [PATCH 3/7] Fix undefined results_list in case GET_SEC_TXT = false --- generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generator.py b/generator.py index 781c780..c7ded40 100644 --- a/generator.py +++ b/generator.py @@ -173,6 +173,7 @@ def genStaticFiles(results: dict): genSecurityTxtForDomain((0, domain), gen_sites) else: domains_dict = {} + results_list = [] if os.environ.get("GET_SEC_TXT", "false") == "true": sites = Sites() @@ -184,7 +185,6 @@ def genStaticFiles(results: dict): else: raise Exception("No domains") - results_list = [] with Pool(int(os.environ.get("POOL_SIZE", os.cpu_count()))) as p: results_list = p.map(genSecurityTxtForDomain, domains_dict.items()) From 24fb0ce5b795b9c0d712314238e2a880c94dca7e Mon Sep 17 00:00:00 2001 From: Esa Jokinen Date: Thu, 18 Jul 2024 00:27:01 +0300 Subject: [PATCH 4/7] Do not repeat old messages & errors on new log lines --- json_logger.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/json_logger.py b/json_logger.py index c8606a3..d1b146a 100644 --- a/json_logger.py +++ b/json_logger.py @@ -8,13 +8,11 @@ def log( ): obj.update({"time": time.time()}) - if target is not None and message is not None: - obj.update({"target": target, "message": str(message)}) - elif target is not None: - obj.update({"message": str(target)}) + if target is not None: + obj.update({"target": str(target)}) - if error is not None: - obj.update({"error": str(error)}) + obj.update({"message": str(message)}) + obj.update({"error": str(error)}) print( json.dumps(obj, default=str), file=sys.stdout if error is None else sys.stderr From ed25a4f76b91071f7ca8c4e262a096efe960023e Mon Sep 17 00:00:00 2001 From: Esa Jokinen Date: Thu, 18 Jul 2024 00:27:08 +0300 Subject: [PATCH 5/7] All log messages as JSON --- generator.py | 8 +++++--- jinja_helper.py | 6 +++++- sites.py | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/generator.py b/generator.py index c7ded40..1838c74 100644 --- a/generator.py +++ b/generator.py @@ -24,7 +24,7 @@ def setupDist(): try: shutil.rmtree(dist) except Exception as e: - print(e) + log(target="ERROR", message=f"removing {dist} failed", error=e) os.mkdir(dist) os.mkdir(top_sites) @@ -180,8 +180,10 @@ def genStaticFiles(results: dict): domains_dict = sites.getTop500() if len(domains_dict) > 0: - print("Got domain lists, counts:") - print("Total -", len(domains_dict)) + log( + target="INFO", + message="Got domain lists; total count {len(domains_dict)}", + ) else: raise Exception("No domains") diff --git a/jinja_helper.py b/jinja_helper.py index 09e5894..f461850 100644 --- a/jinja_helper.py +++ b/jinja_helper.py @@ -7,6 +7,8 @@ import json import subresource_integrity as integrity +from json_logger import log + def colourFromLetter(letter: str = "") -> str: if not letter: @@ -94,7 +96,9 @@ def renderTemplate( params.update({"title": params["dest_domain"]}) else: description = "" - print(f"Skipping canonical and description for: {filename}") + log( + target="INFO", message=f"Skipping canonical and description for: {filename}" + ) params.update({"description": description}) params.update({"canonical": canonical}) diff --git a/sites.py b/sites.py index 18fab25..9bdf782 100644 --- a/sites.py +++ b/sites.py @@ -19,7 +19,7 @@ def __init__(self): self.top500 = {} def __str__(self) -> str: - return f"{self.getTop500()}" + return 
f"{json.dumps(self.getTop500())}" def getTop500(self) -> dict: if self.cacheAge() > Sites.CACHE_MAX_AGE: From 165f10c80524c989e6829e418da5246f5883ae20 Mon Sep 17 00:00:00 2001 From: Esa Jokinen Date: Thu, 18 Jul 2024 00:52:37 +0300 Subject: [PATCH 6/7] Lower cache max age to 18 hours Since the GitHub workflow updates the site daily at the same time, the CACHE_MAX_AGE has to be lower than 24 hours. --- sites.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sites.py b/sites.py index 9bdf782..f716301 100644 --- a/sites.py +++ b/sites.py @@ -13,7 +13,7 @@ class Sites: TOP_SITES_SOURCE = "https://moz.com/top-500/download?table=top500Domains" USER_AGENT = "Mozilla/5.0 (OllieJC/findsecuritycontacts.com)" CACHE_FILE = "top500.json" - CACHE_MAX_AGE = 86400 + CACHE_MAX_AGE = 64800 # 18 hours def __init__(self): self.top500 = {} From 78a81b0dfd19ed3615722c1a1cff8fda93bfdc21 Mon Sep 17 00:00:00 2001 From: Esa Jokinen Date: Sun, 22 Sep 2024 08:56:23 +0300 Subject: [PATCH 7/7] Add a newline at the end of the cache file --- sites.py | 1 + top500.json | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/sites.py b/sites.py index f716301..f29c57f 100644 --- a/sites.py +++ b/sites.py @@ -65,6 +65,7 @@ def updateCache(self): ) with open(Sites.CACHE_FILE, "w+") as cachefile: json.dump(self.top500, cachefile, indent=2) + cachefile.write("\n") except (OSError, Exception) as e: log(target="ERROR", error=e) diff --git a/top500.json b/top500.json index 9031a34..fb60cc8 100644 --- a/top500.json +++ b/top500.json @@ -499,4 +499,4 @@ "498": "bp0.blogger.com", "499": "box.com", "500": "nhk.or.jp" -} \ No newline at end of file +}