Use requests streaming mode to avoid loading large response bodies into memory

harvard-lil · Jan 16, 2025 · f1fd733
1 parent da35002
commit f1fd733
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 2 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -0,0 +1,26 @@
+# Changelog
+
+## [Unreleased]
+
+### Added
+- 
+
+### Changed
+- 
+
+### Deprecated
+- 
+
+### Removed
+- 
+
+### Fixed
+- 
+
+### Security
+- 
+
+## [0.1.2] - 2025-01-16
+
+### Fixed
+- Use requests streaming mode to avoid loading large response bodies into memory
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "nabit"
-version = "0.1.1"
+version = "0.1.2"
 description = "Archive and sign datasets"
 readme = "README.md"
 requires-python = ">=3.11"

diff --git a/src/nabit/lib/backends/url.py b/src/nabit/lib/backends/url.py
@@ -62,7 +62,11 @@ def _collect(self, files_dir: Path) -> None:
             warc_writer = FileWriter(fh, warc_path, gzip=False, content_type_overrides=self.content_type_overrides)
             with capture_http(warc_writer):
                 warc_writer.custom_out_path = self.output
-                requests.get(self.url, timeout=self.timeout)
+                response = requests.get(self.url, timeout=self.timeout, stream=True)
+                # consume the streaming response so capture_http writes it
+                # see https://github.com/webrecorder/warcio/issues/187 for whether this is necessary
+                for _ in response.iter_content(chunk_size=2**16):
+                    pass
         return {'path': str(warc_writer.result_path)}
 
     def request_dict(self) -> dict: