Skip to content

Commit

Permalink
Use requests streaming mode to avoid loading large URLs into memory
Browse files Browse the repository at this point in the history
  • Loading branch information
jcushman committed Jan 16, 2025
1 parent da35002 commit f1fd733
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 2 deletions.
26 changes: 26 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Changelog

## [Unreleased]

### Added
-

### Changed
-

### Deprecated
-

### Removed
-

### Fixed
-

### Security
-

## [0.1.2] - 2025-01-16

### Fixed
- Use requests streaming mode to avoid loading large URLs into memory
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "nabit"
-version = "0.1.1"
+version = "0.1.2"
description = "Archive and sign datasets"
readme = "README.md"
requires-python = ">=3.11"
Expand Down
6 changes: 5 additions & 1 deletion src/nabit/lib/backends/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,11 @@ def _collect(self, files_dir: Path) -> None:
warc_writer = FileWriter(fh, warc_path, gzip=False, content_type_overrides=self.content_type_overrides)
with capture_http(warc_writer):
warc_writer.custom_out_path = self.output
- requests.get(self.url, timeout=self.timeout)
+ response = requests.get(self.url, timeout=self.timeout, stream=True)
+ # consume the streaming response so capture_http writes it
+ # see https://github.com/webrecorder/warcio/issues/187 for whether this is necessary
+ for _ in response.iter_content(chunk_size=2**16):
+     pass
return {'path': str(warc_writer.result_path)}

def request_dict(self) -> dict:
Expand Down

0 comments on commit f1fd733

Please sign in to comment.