From 33954fd45bd6a36719051ce1e435add1bb3759b5 Mon Sep 17 00:00:00 2001 From: CasualYT31 <21147925+CasualYT31@users.noreply.github.com> Date: Mon, 23 Dec 2024 15:54:22 +0000 Subject: [PATCH] Support downloading videos and audio Respond to comments Improve archiving and file naming --- gallery_dl/extractor/tiktok.py | 71 ++++++++++++++++------ gallery_dl/text.py | 15 ----- test/results/tiktok.py | 107 +++++++++++++++++++++++++++++---- test/test_text.py | 24 -------- 4 files changed, 147 insertions(+), 70 deletions(-) diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py index 0dbbedf856..787ee65f3e 100644 --- a/gallery_dl/extractor/tiktok.py +++ b/gallery_dl/extractor/tiktok.py @@ -22,8 +22,8 @@ class TiktokExtractor(Extractor): category = "tiktok" directory_fmt = ("{category}", "{user}") - filename_fmt = "{title} [{id}] [{index}].{extension}" - archive_fmt = "{id}_{img_id}" + filename_fmt = "{title} [{id}{index:?_//}{img_id:?_//}].{extension}" + archive_fmt = "{id}_{index}_{img_id}" root = "https://www.tiktok.com/" cookies_domain = ".tiktok.com" @@ -31,16 +31,17 @@ def urls(self): return [self.url] def items(self): + videos = self.config("videos", True) for tiktok_url in self.urls(): # If we can recognise that this is a /photo/ link, preemptively # replace it with /video/ to prevent a needless second request. # See below. - tiktok_url = compile( + tiktok_url_to_use = compile( escape("/photo/"), IGNORECASE ).sub("/video/", tiktok_url) video_detail = util.json_loads(text.extr( - self.request(tiktok_url).text, + self.request(tiktok_url_to_use).text, '' @@ -48,20 +49,40 @@ def items(self): if "webapp.video-detail" not in video_detail: # Only /video/ links result in the video-detail dict we need. # Try again using that form of link. - tiktok_url = video_detail["seo.abtest"]["canonical"] \ + tiktok_url_to_use = video_detail["seo.abtest"]["canonical"] \ .replace("/photo/", "/video/") video_detail = util.json_loads(text.extr( - self.request(tiktok_url).text, + self.request(tiktok_url_to_use).text, '' ))["__DEFAULT_SCOPE__"] video_detail = video_detail["webapp.video-detail"] - has_status = "statusMsg" in video_detail - if has_status and video_detail["statusMsg"] == "author_secret": - raise exception.AuthorizationError("Login required to access " - "this post") + if "statusCode" in video_detail: + if video_detail["statusCode"] == 10222: + raise exception.AuthorizationError( + tiktok_url + ": Login required to access this post" + ) + elif video_detail["statusCode"] == 10204: + raise exception.NotFoundError(tiktok_url) + elif video_detail["statusCode"] == 10231: + raise exception.ExtractionError( + tiktok_url + " is region locked, try downloading with " + "a VPN/proxy connection" + ) + elif video_detail["statusCode"] != 0: + raise exception.ExtractionError( + tiktok_url + ": Received unknown error code " + + str(video_detail['statusCode']) + " with message " + + (video_detail['statusMsg'] if + "statusMsg" in video_detail else "") + ) post_info = video_detail["itemInfo"]["itemStruct"] + id = post_info["id"] + original_title = title = post_info["desc"] + if len(original_title) == 0: + title = "TikTok photo #{}".format(id) + title = title[:150] user = post_info["author"]["uniqueId"] if "imagePost" in post_info: yield Message.Directory, {"user": user} @@ -69,22 +90,36 @@ def items(self): for i, img in enumerate(img_list): url = img["imageURL"]["urlList"][0] name_and_ext = text.nameext_from_url(url) - id = post_info["id"] - title = post_info["desc"] - if len(title) == 0: - title = "TikTok photo #{}".format(id) yield Message.Url, url, { - "title" : text.sanitize_for_filename(title)[:170], + "title" : title, "id" : id, - "index" : i, + "index" : i + 1, "img_id" : name_and_ext["filename"].split("~")[0], "extension" : name_and_ext["extension"], "width" : img["imageWidth"], "height" : img["imageHeight"] } + elif videos: + # It's probably obvious but I thought it was worth noting + # because I got stuck on this for a while: make sure to emit + # a Directory message before attempting to download anything + # with yt-dlp! Otherwise you'll run into NoneType, set_filename + # errors since the download job doesn't get initialized. + yield Message.Directory, {"user": user} + if len(original_title) == 0: + title = "TikTok video #{}".format(id) + title = title[:150] else: - # TODO: Not a slide show. Should pass this to yt-dlp. - pass + self.log.info("Skipping video post %s", tiktok_url) + if videos: + yield Message.Url, "ytdl:" + tiktok_url_to_use, { + "filename" : "", + "extension" : "", + "title" : title, + "id" : id, + "index" : "", + "img_id" : "" + } class TiktokPostExtractor(TiktokExtractor): diff --git a/gallery_dl/text.py b/gallery_dl/text.py index 246efee320..5fd5a40715 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -51,21 +51,6 @@ def slugify(value): return re.sub(r"[-\s]+", "-", value).strip("-_") -def sanitize_for_filename(string): - """Removes characters from a string that would be illegal to have in - a filename - - This function is similar to slugify(), except it retains more - characters (notably characters such as # and @). - - Note that the length of the string is not capped! - - Inspiration: - https://stackoverflow.com/a/71199182 - """ - return re.sub(r"[/\\?%*:|\"<>\x7F\x00-\x1F]", " ", str(string)) - - def ensure_http_scheme(url, scheme="https://"): """Prepend 'scheme' to 'url' if it doesn't have one""" if url and not url.startswith(("https://", "http://")): diff --git a/test/results/tiktok.py b/test/results/tiktok.py index 43221764d6..4bdbd9bdab 100644 --- a/test/results/tiktok.py +++ b/test/results/tiktok.py @@ -5,66 +5,147 @@ # published by the Free Software Foundation. from gallery_dl.extractor import tiktok +from gallery_dl import exception PATTERN = r"https://p1[69]-.*\.tiktokcdn.*\.com/.*/[0-9a-fA-F]+~.*\.jpeg" +PATTERN_WITH_AUDIO = r"(?:" + PATTERN + r")|(?:ytdl\:)" __tests__ = ( -# Test many photos. { "#url" : "https://www.tiktok.com/@chillezy/photo/7240568259186019630", + "#comment" : "/photo/ link: many photos", "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, - "#pattern" : PATTERN + "#pattern" : PATTERN, + "#options" : {"videos": False} }, { "#url" : "https://www.tiktok.com/@chillezy/video/7240568259186019630", + "#comment" : "/video/ link: many photos", "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, - "#pattern" : PATTERN + "#pattern" : PATTERN, + "#options" : {"videos": False} }, { "#url" : "https://vm.tiktok.com/ZGdh4WUhr/", + "#comment" : "vm.tiktok.com link: many photos", "#category" : ("", "tiktok", "vmpost"), "#class" : tiktok.TiktokVmpostExtractor, - "#pattern" : PATTERN + "#pattern" : PATTERN, + "#options" : {"videos": False} }, -# Test one photo. { "#url" : "https://www.tiktok.com/@d4vinefem/photo/7449575367024626974", + "#comment" : "/photo/ link: single photo", "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, - "#pattern" : PATTERN + "#pattern" : PATTERN, + "#options" : {"videos": False} }, { "#url" : "https://www.tiktok.com/@d4vinefem/video/7449575367024626974", + "#comment" : "/video/ link: single photo", "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, - "#pattern" : PATTERN + "#pattern" : PATTERN, + "#options" : {"videos": False} }, { "#url" : "https://vm.tiktok.com/ZGdhVtER2/", + "#comment" : "vm.tiktok.com link: single photo", "#category" : ("", "tiktok", "vmpost"), "#class" : tiktok.TiktokVmpostExtractor, - "#pattern" : PATTERN + "#pattern" : PATTERN, + "#options" : {"videos": False} }, -# Test a few photos. { "#url" : "https://www.tiktok.com/@.mcfc.central/photo/7449701420934122785", + "#comment" : "/photo/ link: few photos", "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, - "#pattern" : PATTERN + "#pattern" : PATTERN, + "#options" : {"videos": False} }, { "#url" : "https://www.tiktok.com/@.mcfc.central/video/7449701420934122785", + "#comment" : "/video/ link: few photos", "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, - "#pattern" : PATTERN + "#pattern" : PATTERN, + "#options" : {"videos": False} }, { "#url" : "https://vm.tiktok.com/ZGdhVW3cu/", + "#comment" : "vm.tiktok.com link: few photos", "#category" : ("", "tiktok", "vmpost"), "#class" : tiktok.TiktokVmpostExtractor, - "#pattern" : PATTERN -} + "#pattern" : PATTERN, + "#options" : {"videos": False} +}, +{ + "#url" : "https://www.tiktok.com/@ughuwhguweghw/video/1", + "#comment" : "deleted post", + "#category" : ("", "tiktok", "post"), + "#class" : tiktok.TiktokPostExtractor, + "#exception" : exception.NotFoundError, + "#options" : {"videos": False} +}, +{ + "#url" : "https://www.tiktok.com/@memezar/video/7449708266168274208", + "#comment" : "Video post", + "#category" : ("", "tiktok", "post"), + "#class" : tiktok.TiktokPostExtractor, + "#urls" : "ytdl:https://www.tiktok.com/@memezar/video/7449708266168274208", + "#options" : {"videos": True} +}, +{ + "#url" : "https://www.tiktok.com/@memezar/photo/7449708266168274208", + "#comment" : "Video post as a /photo/ link", + "#category" : ("", "tiktok", "post"), + "#class" : tiktok.TiktokPostExtractor, + "#urls" : "ytdl:https://www.tiktok.com/@memezar/video/7449708266168274208", + "#options" : {"videos": True} +}, +{ + "#url" : "https://vm.tiktok.com/ZGdht7cjp/", + "#comment" : "Video post as a VM link", + "#category" : ("", "tiktok", "vmpost"), + "#class" : tiktok.TiktokVmpostExtractor, + "#urls" : "ytdl:https://vm.tiktok.com/ZGdht7cjp/", + "#options" : {"videos": True} +}, +{ + "#url" : "https://www.tiktok.com/@memezar/video/7449708266168274208", + "#comment" : "Skipping video post", + "#category" : ("", "tiktok", "post"), + "#class" : tiktok.TiktokPostExtractor, + "#urls" : [], + "#options" : {"videos": False} +}, +{ + "#url" : "https://www.tiktok.com/@chillezy/photo/7240568259186019630", + "#comment" : "/photo/ link: many photos with audio", + "#category" : ("", "tiktok", "post"), + "#class" : tiktok.TiktokPostExtractor, + "#pattern" : PATTERN_WITH_AUDIO, + "#options" : {"videos": True} +}, +{ + "#url" : "https://www.tiktok.com/@chillezy/video/7240568259186019630", + "#comment" : "/video/ link: many photos with audio", + "#category" : ("", "tiktok", "post"), + "#class" : tiktok.TiktokPostExtractor, + "#pattern" : PATTERN_WITH_AUDIO, + "#options" : {"videos": True} +}, +{ + "#url" : "https://vm.tiktok.com/ZGdh4WUhr/", + "#comment" : "vm.tiktok.com link: many photos with audio", + "#category" : ("", "tiktok", "vmpost"), + "#class" : tiktok.TiktokVmpostExtractor, + "#pattern" : PATTERN_WITH_AUDIO, + "#options" : {"videos": True} +}, ) diff --git a/test/test_text.py b/test/test_text.py index 5b97db7f91..1b19c4742a 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -92,30 +92,6 @@ def test_slugify(self, f=text.slugify): self.assertEqual(f(1), "1") self.assertEqual(f(2.3), "23") - def test_sanitize_for_filename(self, f=text.sanitize_for_filename): - self.assertEqual(f("Hello World"), "Hello World") - self.assertEqual(f("-HeLLo---World-"), "-HeLLo---World-") - self.assertEqual( - f("_-H#e:l#l:o+\t+W?o!rl=d-_"), - "_-H#e l#l o+ +W o!rl=d-_" - ) - self.assertEqual(f("_Hello_World_"), "_Hello_World_") - self.assertEqual( - f("/\\?%*:|\"<>\x7F\x00\x0B\x1F"), - " " - ) - - self.assertEqual(f(""), "") - self.assertEqual(f("-"), "-") - self.assertEqual(f("--"), "--") - - self.assertEqual(f(()), "()") - self.assertEqual(f([]), "[]") - self.assertEqual(f({}), "{}") - self.assertEqual(f(None), "None") - self.assertEqual(f(1), "1") - self.assertEqual(f(2.3), "2.3") - def test_ensure_http_scheme(self, f=text.ensure_http_scheme): result = "https://example.org/filename.ext"