From b903aeb7e60f8bfeefb45f458e4ebd018df858a9 Mon Sep 17 00:00:00 2001 From: Pahud Hsieh Date: Sat, 17 Aug 2024 16:41:12 +0000 Subject: [PATCH 1/3] support headers --- README.md | 8 ++++++++ youtube_transcript_api/_api.py | 5 ++++- youtube_transcript_api/test/test_api.py | 6 ++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 337b875..a66cfd0 100644 --- a/README.md +++ b/README.md @@ -371,6 +371,14 @@ Using the CLI: youtube_transcript_api --cookies /path/to/your/cookies.txt ``` +## Headers + +You can pass additional headers to the `list_transcripts` method by providing a dictionary of headers as the `headers` parameter. This can be useful for various purposes, such as specifying the accepted encoding, setting a custom user agent, or including any other necessary headers for the request. + +```python +headers = {'Accept-Encoding': 'gzip, deflate'} +transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, headers=headers) +``` ## Warning diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 24a1236..4a02bd7 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -16,7 +16,7 @@ class YouTubeTranscriptApi(object): @classmethod - def list_transcripts(cls, video_id, proxies=None, cookies=None): + def list_transcripts(cls, video_id, proxies=None, cookies=None, headers=None): """ Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating @@ -61,6 +61,8 @@ def list_transcripts(cls, video_id, proxies=None, cookies=None): :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :param cookies: a string of the path to a text file containing youtube authorization cookies :type cookies: str + :param headers: a dictionary of additional headers to include in the requests + :type headers: dict :return: the list of available transcripts :rtype TranscriptList: """ @@ -68,6 +70,7 @@ def list_transcripts(cls, video_id, proxies=None, cookies=None): if cookies: http_client.cookies = cls._load_cookies(cookies, video_id) http_client.proxies = proxies if proxies else {} + http_client.headers.update(headers) if headers else {} return TranscriptListFetcher(http_client).fetch(video_id) @classmethod diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index 9b5e732..6527fd8 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -291,6 +291,12 @@ def test_get_transcript__with_cookies(self): ] ) + def test_get_transcript__with_headers(self): + headers = {'Accept-Encoding': 'gzip, deflate'} + transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8', headers=headers) + language_codes = {transcript.language_code for transcript in transcript_list} + self.assertGreater(len(language_codes), 0) + def test_get_transcript__assertionerror_if_input_not_string(self): with self.assertRaises(AssertionError): YouTubeTranscriptApi.get_transcript(['video_id_1', 'video_id_2']) From c4b7e0f185fc4bf83c1fd36b3312318abc91059c Mon Sep 17 00:00:00 2001 From: Pahud Hsieh Date: Sun, 18 Aug 2024 13:27:10 +0000 Subject: [PATCH 2/3] minor --- youtube_transcript_api/_transcripts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index ef1f44b..160513a 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -288,7 +288,8 @@ def fetch(self, preserve_formatting=False): :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :rtype [{'text': str, 'start': float, 'end': float}]: """ - response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'}) + self._http_client.headers.update({'Accept-Language': 'en-US'}) + response = self._http_client.get(self._url) return _TranscriptParser(preserve_formatting=preserve_formatting).parse( _raise_http_errors(response, self.video_id).text, ) From eef2fe340e11e49031b1ee3eec76c6bb1d3ea671 Mon Sep 17 00:00:00 2001 From: Pahud Hsieh Date: Mon, 19 Aug 2024 03:38:04 +0000 Subject: [PATCH 3/3] minor --- youtube_transcript_api/_transcripts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 160513a..60bc4d5 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -88,7 +88,8 @@ def _fetch_video_html(self, video_id): return html def _fetch_html(self, video_id): - response = self._http_client.get(WATCH_URL.format(video_id=video_id), headers={'Accept-Language': 'en-US'}) + self._http_client.headers.update({'Accept-Language': 'en-US'}) + response = self._http_client.get(WATCH_URL.format(video_id=video_id)) return unescape(_raise_http_errors(response, video_id).text)