From 33954fd45bd6a36719051ce1e435add1bb3759b5 Mon Sep 17 00:00:00 2001
From: CasualYT31 <21147925+CasualYT31@users.noreply.github.com>
Date: Mon, 23 Dec 2024 15:54:22 +0000
Subject: [PATCH] Support downloading videos and audio
Respond to comments
Improve archiving and file naming
---
gallery_dl/extractor/tiktok.py | 71 ++++++++++++++++------
gallery_dl/text.py | 15 -----
test/results/tiktok.py | 107 +++++++++++++++++++++++++++++----
test/test_text.py | 24 --------
4 files changed, 147 insertions(+), 70 deletions(-)
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
index 0dbbedf856..787ee65f3e 100644
--- a/gallery_dl/extractor/tiktok.py
+++ b/gallery_dl/extractor/tiktok.py
@@ -22,8 +22,8 @@ class TiktokExtractor(Extractor):
category = "tiktok"
directory_fmt = ("{category}", "{user}")
- filename_fmt = "{title} [{id}] [{index}].{extension}"
- archive_fmt = "{id}_{img_id}"
+ filename_fmt = "{title} [{id}{index:?_//}{img_id:?_//}].{extension}"
+ archive_fmt = "{id}_{index}_{img_id}"
root = "https://www.tiktok.com/"
cookies_domain = ".tiktok.com"
@@ -31,16 +31,17 @@ def urls(self):
return [self.url]
def items(self):
+ videos = self.config("videos", True)
for tiktok_url in self.urls():
# If we can recognise that this is a /photo/ link, preemptively
# replace it with /video/ to prevent a needless second request.
# See below.
- tiktok_url = compile(
+ tiktok_url_to_use = compile(
escape("/photo/"),
IGNORECASE
).sub("/video/", tiktok_url)
video_detail = util.json_loads(text.extr(
- self.request(tiktok_url).text,
+ self.request(tiktok_url_to_use).text,
''
@@ -48,20 +49,40 @@ def items(self):
if "webapp.video-detail" not in video_detail:
# Only /video/ links result in the video-detail dict we need.
# Try again using that form of link.
- tiktok_url = video_detail["seo.abtest"]["canonical"] \
+ tiktok_url_to_use = video_detail["seo.abtest"]["canonical"] \
.replace("/photo/", "/video/")
video_detail = util.json_loads(text.extr(
- self.request(tiktok_url).text,
+ self.request(tiktok_url_to_use).text,
''
))["__DEFAULT_SCOPE__"]
video_detail = video_detail["webapp.video-detail"]
- has_status = "statusMsg" in video_detail
- if has_status and video_detail["statusMsg"] == "author_secret":
- raise exception.AuthorizationError("Login required to access "
- "this post")
+ if "statusCode" in video_detail:
+ if video_detail["statusCode"] == 10222:
+ raise exception.AuthorizationError(
+ tiktok_url + ": Login required to access this post"
+ )
+ elif video_detail["statusCode"] == 10204:
+ raise exception.NotFoundError(tiktok_url)
+ elif video_detail["statusCode"] == 10231:
+ raise exception.ExtractionError(
+ tiktok_url + " is region locked, try downloading with "
+ "a VPN/proxy connection"
+ )
+ elif video_detail["statusCode"] != 0:
+ raise exception.ExtractionError(
+ tiktok_url + ": Received unknown error code " +
+ str(video_detail['statusCode']) + " with message " +
+ (video_detail['statusMsg'] if
+ "statusMsg" in video_detail else "")
+ )
post_info = video_detail["itemInfo"]["itemStruct"]
+ id = post_info["id"]
+ original_title = title = post_info["desc"]
+ if len(original_title) == 0:
+ title = "TikTok photo #{}".format(id)
+ title = title[:150]
user = post_info["author"]["uniqueId"]
if "imagePost" in post_info:
yield Message.Directory, {"user": user}
@@ -69,22 +90,36 @@ def items(self):
for i, img in enumerate(img_list):
url = img["imageURL"]["urlList"][0]
name_and_ext = text.nameext_from_url(url)
- id = post_info["id"]
- title = post_info["desc"]
- if len(title) == 0:
- title = "TikTok photo #{}".format(id)
yield Message.Url, url, {
- "title" : text.sanitize_for_filename(title)[:170],
+ "title" : title,
"id" : id,
- "index" : i,
+ "index" : i + 1,
"img_id" : name_and_ext["filename"].split("~")[0],
"extension" : name_and_ext["extension"],
"width" : img["imageWidth"],
"height" : img["imageHeight"]
}
+ elif videos:
+ # Emit a Directory message before handing any URL to yt-dlp.
+ # Without it the download job is never initialized and the
+ # download fails with NoneType / set_filename errors. Image
+ # posts already emit theirs in the branch above; plain video
+ # posts have to do it here.
+ yield Message.Directory, {"user": user}
+ if len(original_title) == 0:
+ title = "TikTok video #{}".format(id)
+ title = title[:150]
else:
- # TODO: Not a slide show. Should pass this to yt-dlp.
- pass
+ self.log.info("Skipping video post %s", tiktok_url)
+ if videos:
+ yield Message.Url, "ytdl:" + tiktok_url_to_use, {
+ "filename" : "",
+ "extension" : "",
+ "title" : title,
+ "id" : id,
+ "index" : "",
+ "img_id" : ""
+ }
class TiktokPostExtractor(TiktokExtractor):
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 246efee320..5fd5a40715 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -51,21 +51,6 @@ def slugify(value):
return re.sub(r"[-\s]+", "-", value).strip("-_")
-def sanitize_for_filename(string):
- """Removes characters from a string that would be illegal to have in
- a filename
-
- This function is similar to slugify(), except it retains more
- characters (notably characters such as # and @).
-
- Note that the length of the string is not capped!
-
- Inspiration:
- https://stackoverflow.com/a/71199182
- """
- return re.sub(r"[/\\?%*:|\"<>\x7F\x00-\x1F]", " ", str(string))
-
-
def ensure_http_scheme(url, scheme="https://"):
"""Prepend 'scheme' to 'url' if it doesn't have one"""
if url and not url.startswith(("https://", "http://")):
diff --git a/test/results/tiktok.py b/test/results/tiktok.py
index 43221764d6..4bdbd9bdab 100644
--- a/test/results/tiktok.py
+++ b/test/results/tiktok.py
@@ -5,66 +5,147 @@
# published by the Free Software Foundation.
from gallery_dl.extractor import tiktok
+from gallery_dl import exception
PATTERN = r"https://p1[69]-.*\.tiktokcdn.*\.com/.*/[0-9a-fA-F]+~.*\.jpeg"
+PATTERN_WITH_AUDIO = r"(?:" + PATTERN + r")|(?:ytdl\:)"
__tests__ = (
-# Test many photos.
{
"#url" : "https://www.tiktok.com/@chillezy/photo/7240568259186019630",
+ "#comment" : "/photo/ link: many photos",
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
- "#pattern" : PATTERN
+ "#pattern" : PATTERN,
+ "#options" : {"videos": False}
},
{
"#url" : "https://www.tiktok.com/@chillezy/video/7240568259186019630",
+ "#comment" : "/video/ link: many photos",
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
- "#pattern" : PATTERN
+ "#pattern" : PATTERN,
+ "#options" : {"videos": False}
},
{
"#url" : "https://vm.tiktok.com/ZGdh4WUhr/",
+ "#comment" : "vm.tiktok.com link: many photos",
"#category" : ("", "tiktok", "vmpost"),
"#class" : tiktok.TiktokVmpostExtractor,
- "#pattern" : PATTERN
+ "#pattern" : PATTERN,
+ "#options" : {"videos": False}
},
-# Test one photo.
{
"#url" : "https://www.tiktok.com/@d4vinefem/photo/7449575367024626974",
+ "#comment" : "/photo/ link: single photo",
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
- "#pattern" : PATTERN
+ "#pattern" : PATTERN,
+ "#options" : {"videos": False}
},
{
"#url" : "https://www.tiktok.com/@d4vinefem/video/7449575367024626974",
+ "#comment" : "/video/ link: single photo",
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
- "#pattern" : PATTERN
+ "#pattern" : PATTERN,
+ "#options" : {"videos": False}
},
{
"#url" : "https://vm.tiktok.com/ZGdhVtER2/",
+ "#comment" : "vm.tiktok.com link: single photo",
"#category" : ("", "tiktok", "vmpost"),
"#class" : tiktok.TiktokVmpostExtractor,
- "#pattern" : PATTERN
+ "#pattern" : PATTERN,
+ "#options" : {"videos": False}
},
-# Test a few photos.
{
"#url" : "https://www.tiktok.com/@.mcfc.central/photo/7449701420934122785",
+ "#comment" : "/photo/ link: few photos",
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
- "#pattern" : PATTERN
+ "#pattern" : PATTERN,
+ "#options" : {"videos": False}
},
{
"#url" : "https://www.tiktok.com/@.mcfc.central/video/7449701420934122785",
+ "#comment" : "/video/ link: few photos",
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
- "#pattern" : PATTERN
+ "#pattern" : PATTERN,
+ "#options" : {"videos": False}
},
{
"#url" : "https://vm.tiktok.com/ZGdhVW3cu/",
+ "#comment" : "vm.tiktok.com link: few photos",
"#category" : ("", "tiktok", "vmpost"),
"#class" : tiktok.TiktokVmpostExtractor,
- "#pattern" : PATTERN
-}
+ "#pattern" : PATTERN,
+ "#options" : {"videos": False}
+},
+{
+ "#url" : "https://www.tiktok.com/@ughuwhguweghw/video/1",
+ "#comment" : "deleted post",
+ "#category" : ("", "tiktok", "post"),
+ "#class" : tiktok.TiktokPostExtractor,
+ "#exception" : exception.NotFoundError,
+ "#options" : {"videos": False}
+},
+{
+ "#url" : "https://www.tiktok.com/@memezar/video/7449708266168274208",
+ "#comment" : "Video post",
+ "#category" : ("", "tiktok", "post"),
+ "#class" : tiktok.TiktokPostExtractor,
+ "#urls" : "ytdl:https://www.tiktok.com/@memezar/video/7449708266168274208",
+ "#options" : {"videos": True}
+},
+{
+ "#url" : "https://www.tiktok.com/@memezar/photo/7449708266168274208",
+ "#comment" : "Video post as a /photo/ link",
+ "#category" : ("", "tiktok", "post"),
+ "#class" : tiktok.TiktokPostExtractor,
+ "#urls" : "ytdl:https://www.tiktok.com/@memezar/video/7449708266168274208",
+ "#options" : {"videos": True}
+},
+{
+ "#url" : "https://vm.tiktok.com/ZGdht7cjp/",
+ "#comment" : "Video post as a VM link",
+ "#category" : ("", "tiktok", "vmpost"),
+ "#class" : tiktok.TiktokVmpostExtractor,
+ "#urls" : "ytdl:https://vm.tiktok.com/ZGdht7cjp/",
+ "#options" : {"videos": True}
+},
+{
+ "#url" : "https://www.tiktok.com/@memezar/video/7449708266168274208",
+ "#comment" : "Skipping video post",
+ "#category" : ("", "tiktok", "post"),
+ "#class" : tiktok.TiktokPostExtractor,
+ "#urls" : [],
+ "#options" : {"videos": False}
+},
+{
+ "#url" : "https://www.tiktok.com/@chillezy/photo/7240568259186019630",
+ "#comment" : "/photo/ link: many photos with audio",
+ "#category" : ("", "tiktok", "post"),
+ "#class" : tiktok.TiktokPostExtractor,
+ "#pattern" : PATTERN_WITH_AUDIO,
+ "#options" : {"videos": True}
+},
+{
+ "#url" : "https://www.tiktok.com/@chillezy/video/7240568259186019630",
+ "#comment" : "/video/ link: many photos with audio",
+ "#category" : ("", "tiktok", "post"),
+ "#class" : tiktok.TiktokPostExtractor,
+ "#pattern" : PATTERN_WITH_AUDIO,
+ "#options" : {"videos": True}
+},
+{
+ "#url" : "https://vm.tiktok.com/ZGdh4WUhr/",
+ "#comment" : "vm.tiktok.com link: many photos with audio",
+ "#category" : ("", "tiktok", "vmpost"),
+ "#class" : tiktok.TiktokVmpostExtractor,
+ "#pattern" : PATTERN_WITH_AUDIO,
+ "#options" : {"videos": True}
+},
)
diff --git a/test/test_text.py b/test/test_text.py
index 5b97db7f91..1b19c4742a 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -92,30 +92,6 @@ def test_slugify(self, f=text.slugify):
self.assertEqual(f(1), "1")
self.assertEqual(f(2.3), "23")
- def test_sanitize_for_filename(self, f=text.sanitize_for_filename):
- self.assertEqual(f("Hello World"), "Hello World")
- self.assertEqual(f("-HeLLo---World-"), "-HeLLo---World-")
- self.assertEqual(
- f("_-H#e:l#l:o+\t+W?o!rl=d-_"),
- "_-H#e l#l o+ +W o!rl=d-_"
- )
- self.assertEqual(f("_Hello_World_"), "_Hello_World_")
- self.assertEqual(
- f("/\\?%*:|\"<>\x7F\x00\x0B\x1F"),
- " "
- )
-
- self.assertEqual(f(""), "")
- self.assertEqual(f("-"), "-")
- self.assertEqual(f("--"), "--")
-
- self.assertEqual(f(()), "()")
- self.assertEqual(f([]), "[]")
- self.assertEqual(f({}), "{}")
- self.assertEqual(f(None), "None")
- self.assertEqual(f(1), "1")
- self.assertEqual(f(2.3), "2.3")
-
def test_ensure_http_scheme(self, f=text.ensure_http_scheme):
result = "https://example.org/filename.ext"