-
Notifications
You must be signed in to change notification settings - Fork 23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Auto-download files from the staging directory to output #500
base: main
Are you sure you want to change the base?
Changes from all commits
25e17a5
3fc2cc0
7abdf27
56f095d
ba44f7c
381dcae
a723337
92a4574
bc4bc88
551561e
1cb9c84
4c2b558
a6b75ed
9d41103
88e7b0b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
from multiprocessing import Queue | ||
from typing import List, Optional | ||
|
||
from jupyter_scheduler.models import DescribeDownload | ||
from jupyter_scheduler.orm import Download, create_session, generate_uuid | ||
from jupyter_scheduler.pydantic_v1 import BaseModel | ||
from jupyter_scheduler.utils import get_utc_timestamp | ||
|
||
|
||
def initiate_download_standalone(
    job_id: str, download_queue: Queue, db_session, redownload: bool = False
):
    """Register a download for ``job_id`` and enqueue it, without a DownloadManager.

    Intended for multiprocessing contexts where holding a direct reference
    to a DownloadManager instance is not feasible: the caller supplies the
    queue and the database session explicitly.

    Args:
        job_id: Identifier of the job whose files should be downloaded.
        download_queue: Queue that receives the new ``DescribeDownload``.
        db_session: Open database session; the new ``Download`` row is added
            to it and committed here.
        redownload: Flag stored on the download request.
    """
    # Timestamp is taken before the id is generated, then both are bundled
    # into the request model.
    initiated_at = get_utc_timestamp()
    new_id = generate_uuid()
    request = DescribeDownload(
        job_id=job_id,
        download_id=new_id,
        download_initiated_time=initiated_at,
        redownload=redownload,
    )

    # Persist first, then publish: the row is committed before the request
    # becomes visible to whoever consumes the queue.
    db_session.add(Download(**request.dict()))
    db_session.commit()
    download_queue.put(request)
|
||
|
||
class DownloadRecordManager:
    """Stores, fetches and deletes ``Download`` rows for download requests.

    Every method opens its own session from ``self.session`` and closes it
    on exit. Read methods return ``DescribeDownload`` models, converted
    while the session is still open, so callers never receive ORM rows
    bound to a closed session.
    """

    def __init__(self, db_url):
        # The object returned by ``create_session`` is called in each method
        # to open a session that is used as a context manager.
        self.session = create_session(db_url)

    def put(self, download: DescribeDownload):
        """Insert a ``Download`` row built from ``download`` and commit it."""
        with self.session() as session:
            # Use a distinct local name instead of rebinding the parameter.
            record = Download(**download.dict())
            session.add(record)
            session.commit()

    def get(self, download_id: str) -> Optional[DescribeDownload]:
        """Return the download with ``download_id``, or ``None`` if absent."""
        with self.session() as session:
            record = session.query(Download).filter(Download.download_id == download_id).first()
            if record is None:
                return None
            # Convert before the session closes.
            return DescribeDownload.from_orm(record)

    def get_downloads(self) -> List[DescribeDownload]:
        """Return all downloads ordered by ``download_initiated_time``."""
        with self.session() as session:
            records = session.query(Download).order_by(Download.download_initiated_time).all()
            # Match the declared return type (and ``get``): hand back
            # DescribeDownload models rather than raw ORM rows.
            return [DescribeDownload.from_orm(record) for record in records]

    def delete_download(self, download_id: str):
        """Delete the row whose ``download_id`` matches, then commit."""
        with self.session() as session:
            session.query(Download).filter(Download.download_id == download_id).delete()
            session.commit()

    def delete_job_downloads(self, job_id: str):
        """Delete every row belonging to ``job_id``, then commit."""
        with self.session() as session:
            session.query(Download).filter(Download.job_id == job_id).delete()
            session.commit()
|
||
|
||
class DownloadManager:
    """Couples a ``DownloadRecordManager`` with an in-memory download queue.

    Attributes are ``record_manager`` (database access) and ``queue``
    (the ``Queue`` that pending downloads are put on).
    """

    def __init__(self, db_url: str):
        self.record_manager = DownloadRecordManager(db_url=db_url)
        self.queue = Queue()

    def initiate_download(self, job_id: str, redownload: bool):
        """Record and enqueue a download for ``job_id`` using a fresh session."""
        with self.record_manager.session() as db_session:
            initiate_download_standalone(
                job_id=job_id,
                download_queue=self.queue,
                db_session=db_session,
                redownload=redownload,
            )

    def delete_download(self, download_id: str):
        """Delete one download record; delegates to the record manager."""
        self.record_manager.delete_download(download_id)

    def delete_job_downloads(self, job_id: str):
        """Delete all download records of a job; delegates to the record manager."""
        self.record_manager.delete_job_downloads(job_id)

    def populate_queue(self):
        """Put every download returned by the record manager onto the queue."""
        for pending in self.record_manager.get_downloads():
            self.queue.put(pending)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import asyncio | ||
|
||
import traitlets | ||
from jupyter_server.transutils import _i18n | ||
from traitlets.config import LoggingConfigurable | ||
|
||
from jupyter_scheduler.download_manager import DownloadManager | ||
from jupyter_scheduler.job_files_manager import JobFilesManager | ||
|
||
|
||
class BaseDownloadRunner(LoggingConfigurable):
    """Abstract base class for download runners.

    A runner's ``start`` method is called at the start of the Jupyter
    server and is responsible for polling for downloads to download.
    """

    # Configurable polling period, in seconds (defaults to 3).
    downloads_poll_interval = traitlets.Integer(
        default_value=3,
        config=True,
        help=_i18n(
            "The interval in seconds that the download runner polls for downloads to download."
        ),
    )

    def __init__(self, config=None, **kwargs):
        # Only ``config`` is forwarded to the parent initializer; additional
        # keyword arguments are accepted but not passed on.
        super().__init__(config=config)

    def start(self):
        """Start the runner; concrete subclasses must override this."""
        raise NotImplementedError("Must be implemented by subclass")
|
||
|
||
class DownloadRunner(BaseDownloadRunner): | ||
"""Default download runner that maintains a record and a queue of initiated downloads, and polls the queue every `downloads_poll_interval` seconds | ||
for downloads to download. | ||
""" | ||
|
||
def __init__(
    self,
    download_manager: DownloadManager,
    job_files_manager: JobFilesManager,
    config=None,
):
    """Keep references to the collaborators used while processing downloads.

    Args:
        download_manager: Provides the download queue and record access.
        job_files_manager: Performs the copy out of the staging area.
        config: Optional configuration forwarded to the base class.
    """
    super().__init__(config=config)
    self.download_manager = download_manager
    self.job_files_manager = job_files_manager
|
||
async def process_download_queue(self):
    """Drain the download queue, copying files for each still-registered download.

    For every queued item the matching download record is looked up; items
    whose record no longer exists are skipped. Otherwise the job's files
    are copied from staging and the download record is deleted.

    The queue is drained with ``get_nowait()`` rather than an ``empty()``
    check followed by a blocking ``get()``: ``empty()`` is documented as
    unreliable for multiprocessing queues, and a blocking ``get()`` inside
    a coroutine would stall the event loop if the queue turned out to be
    empty after all.
    """
    # Imported locally so this method does not depend on a module-level
    # import of the standard ``queue`` module.
    from queue import Empty

    manager = self.download_manager
    while True:
        try:
            download = manager.queue.get_nowait()
        except Empty:
            # Nothing (more) to process during this poll.
            return
        # Skip downloads whose record has been deleted since they were queued.
        if not manager.record_manager.get(download.download_id):
            continue
        await self.job_files_manager.copy_from_staging(download.job_id, download.redownload)
        manager.delete_download(download.download_id)
Comment on lines
+44
to
+51
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we can avoid using [inline code reference lost in extraction]. I believe that this may fix the process bug previously raised on the E2E tests in this branch. This is the corresponding error message: [error message lost in extraction]
If we remove the need for multiprocessing objects, we may be able to fix this bug without relying on [inline code reference lost in extraction]. |
||
|
||
async def start(self):
    """Populate the queue once, then process it forever at the poll interval.

    This coroutine never returns on its own. After each pass over the queue
    it sleeps for ``downloads_poll_interval`` seconds, re-reading the
    interval on every iteration.
    """
    manager = self.download_manager
    manager.populate_queue()
    while True:
        await self.process_download_queue()
        pause = self.downloads_poll_interval
        await asyncio.sleep(pause)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,13 @@ | ||
import asyncio | ||
import multiprocessing | ||
|
||
from jupyter_core.paths import jupyter_data_dir | ||
from jupyter_server.extension.application import ExtensionApp | ||
from jupyter_server.transutils import _i18n | ||
from traitlets import Bool, Type, Unicode, default | ||
|
||
from jupyter_scheduler.download_manager import DownloadManager | ||
from jupyter_scheduler.download_runner import DownloadRunner | ||
from jupyter_scheduler.orm import create_tables | ||
|
||
from .handlers import ( | ||
|
@@ -67,27 +70,48 @@ def _db_url_default(self): | |
) | ||
|
||
def initialize_settings(self): | ||
# Forces new processes to not be forked on Linux. | ||
# This is necessary because `asyncio.get_event_loop()` is bugged in | ||
# forked processes in Python versions below 3.12. This method is | ||
# called by `jupyter_core` by `nbconvert` in the default executor. | ||
|
||
# See: https://github.com/python/cpython/issues/66285 | ||
# See also: https://github.com/jupyter/jupyter_core/pull/362 | ||
multiprocessing.set_start_method("spawn", force=True) | ||
|
||
Comment on lines
+73
to
+81
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The problem with this is that this line affects the [remainder of sentence lost in extraction].
I don't have a solution for how this bug can be fixed. However, the error message is pretty specific about why an exception is being raised, so my intuition is that this bug can be fixed. I'm leaving some references here for us to review in the future.
|
||
super().initialize_settings() | ||
|
||
create_tables(self.db_url, self.drop_tables) | ||
|
||
environments_manager = self.environment_manager_class() | ||
|
||
download_manager = DownloadManager(db_url=self.db_url) | ||
|
||
scheduler = self.scheduler_class( | ||
root_dir=self.serverapp.root_dir, | ||
environments_manager=environments_manager, | ||
db_url=self.db_url, | ||
download_manager=download_manager, | ||
config=self.config, | ||
) | ||
|
||
job_files_manager = self.job_files_manager_class(scheduler=scheduler) | ||
|
||
download_runner = DownloadRunner( | ||
download_manager=download_manager, job_files_manager=job_files_manager | ||
) | ||
|
||
self.settings.update( | ||
environments_manager=environments_manager, | ||
scheduler=scheduler, | ||
job_files_manager=job_files_manager, | ||
initiate_download=download_manager.initiate_download, | ||
) | ||
|
||
if scheduler.task_runner: | ||
loop = asyncio.get_event_loop() | ||
loop.create_task(scheduler.task_runner.start()) | ||
|
||
if download_runner: | ||
loop = asyncio.get_event_loop() | ||
loop.create_task(download_runner.start()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we merge this class with `DownloadRecordManager` above? I don't see the benefit of splitting the logic here into two separate classes if they are only used together anyways.