set workers based on recommended number of cores #1170

Merged · 3 commits · Aug 27, 2024
3 changes: 1 addition & 2 deletions .run/uvicorn.run.xml
@@ -1,7 +1,6 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="uvicorn" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
<module name="clowder2" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
@@ -14,7 +13,7 @@
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="uvicorn" />
<option name="PARAMETERS" value="app.main:app --reload --host 0.0.0.0" />
<option name="PARAMETERS" value="app.main:app --host 0.0.0.0 --workers 17" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="true" />
3 changes: 2 additions & 1 deletion backend/Dockerfile
@@ -22,4 +22,5 @@ ENV PATH="/code/.venv/bin:$PATH"
COPY ./app /code/app

# launch app using uvicorn
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80"]
# Number of recommended workers is 2 x number_of_cores +1
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "17"]
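The hard-coded 17 matches the (2 x number_of_cores) + 1 rule for an 8-core host: 2 x 8 + 1 = 17. A minimal sketch of deriving the count at container start instead of hard-coding it (the recommended_workers helper and the UVICORN_WORKERS override are illustrative assumptions, not part of this PR):

# Hypothetical sketch: compute the uvicorn worker count from the host
# instead of hard-coding 17, following the (2 * cores) + 1 rule above.
import multiprocessing
import os


def recommended_workers() -> int:
    override = os.environ.get("UVICORN_WORKERS")  # assumed env override
    if override:
        return int(override)
    return 2 * multiprocessing.cpu_count() + 1


if __name__ == "__main__":
    # Prints 17 on an 8-core machine, matching the CMD above.
    print(recommended_workers())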
4 changes: 3 additions & 1 deletion backend/app/main.py
@@ -316,7 +316,9 @@ async def startup_beanie():
ThumbnailDBViewList,
LicenseDB,
],
recreate_views=True,
# If view exists, will not recreate
# When view query changes, make sure to manually drop view and recreate
recreate_views=False,
)
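With recreate_views=False, an existing view keeps its old aggregation pipeline even after the code that defines it changes, which is what the comment above warns about. A rough sketch of the manual drop-and-recreate step, assuming pymongo and illustrative database/view names ("clowder2", "thumbnails_view") that are not taken from this PR:

# Hypothetical sketch: drop a Mongo view so the next startup_beanie() run
# recreates it from the updated view definition.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
db = client["clowder2"]  # assumed database name
db.drop_collection("thumbnails_view")  # views are dropped like collections
# Restarting the app then recreates the view during startup_beanie().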


135 changes: 98 additions & 37 deletions backend/app/routers/datasets.py
@@ -73,6 +73,7 @@
from pymongo import DESCENDING
from rocrate.model.person import Person
from rocrate.rocrate import ROCrate
from starlette.concurrency import run_in_threadpool

router = APIRouter()
security = HTTPBearer()
@@ -1188,16 +1189,26 @@ async def download_dataset(
bag_info_path = os.path.join(current_temp_dir, "bag-info.txt")
tagmanifest_path = os.path.join(current_temp_dir, "tagmanifest-md5.txt")

with open(manifest_path, "w") as f:
pass # Create empty file so no errors later if the dataset is empty

with open(bagit_path, "w") as f:
f.write("Bag-Software-Agent: clowder.ncsa.illinois.edu" + "\n")
f.write("Bagging-Date: " + str(datetime.datetime.now()) + "\n")
await run_in_threadpool(lambda: open(manifest_path, "w").close())
await run_in_threadpool(
lambda: open(bagit_path, "w").write(
"Bag-Software-Agent: clowder.ncsa.illinois.edu"
+ "\n"
+ "Bagging-Date: "
+ str(datetime.datetime.now())
+ "\n"
)
)

with open(bag_info_path, "w") as f:
f.write("BagIt-Version: 0.97" + "\n")
f.write("Tag-File-Character-Encoding: UTF-8" + "\n")
await run_in_threadpool(
lambda: open(bag_info_path, "w").write(
"BagIt-Version: 0.97"
+ "\n"
+ "Tag-File-Character-Encoding: UTF-8"
+ "\n"
)
)
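The writes above move blocking file I/O off the event loop with starlette's run_in_threadpool, but the open(...).write(...) lambdas leave closing the file handle to garbage collection. A minimal sketch of an equivalent pattern that closes the handle explicitly (the _write_text helper is an assumption for illustration, not code from this PR):

# Hypothetical helper: same run_in_threadpool idea as above, with an explicit
# context manager so the file is closed as soon as the write finishes.
from starlette.concurrency import run_in_threadpool


def _write_text(path: str, text: str, mode: str = "w") -> None:
    # Runs in a worker thread, so plain blocking I/O is fine here.
    with open(path, mode) as f:
        f.write(text)


# Usage inside the endpoint might look like:
# await run_in_threadpool(
#     _write_text, bag_info_path,
#     "BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n",
# )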

# Write dataset metadata if found
metadata = await MetadataDB.find(
@@ -1210,6 +1221,10 @@
metadata_content = json_util.dumps(metadata)
with open(datasetmetadata_path, "w") as f:
f.write(metadata_content)
await run_in_threadpool(
lambda: open(datasetmetadata_path, "w").write(metadata_content)
)

crate.add_file(
datasetmetadata_path,
dest_path="metadata/_dataset_metadata.json",
@@ -1232,16 +1247,20 @@
hierarchy = await _get_folder_hierarchy(file.folder_id, "")
dest_folder = os.path.join(current_temp_dir, hierarchy.lstrip("/"))
if not os.path.isdir(dest_folder):
os.makedirs(dest_folder, exist_ok=True)
await run_in_threadpool(os.makedirs, dest_folder, exist_ok=True)
file_name = hierarchy + file_name
current_file_path = os.path.join(current_temp_dir, file_name.lstrip("/"))

content = fs.get_object(settings.MINIO_BUCKET_NAME, bytes_file_id)
file_md5_hash = hashlib.md5(content.data).hexdigest()
with open(current_file_path, "wb") as f1:
f1.write(content.data)
with open(manifest_path, "a") as mpf:
mpf.write(file_md5_hash + " " + file_name + "\n")
await run_in_threadpool(
lambda: open(current_file_path, "wb").write(content.data)
)
await run_in_threadpool(
lambda: open(manifest_path, "a").write(
file_md5_hash + " " + file_name + "\n"
)
)
crate.add_file(
current_file_path,
dest_path="data/" + file_name,
@@ -1262,23 +1281,43 @@
current_temp_dir, metadata_filename
)
metadata_content = json_util.dumps(metadata)
with open(metadata_filename_temp_path, "w") as f:
f.write(metadata_content)
await run_in_threadpool(
lambda: open(metadata_filename_temp_path, "w").write(
metadata_content
)
)
crate.add_file(
metadata_filename_temp_path,
dest_path="metadata/" + metadata_filename,
properties={"name": metadata_filename},
)

bag_size_kb = bag_size / 1024

with open(bagit_path, "a") as f:
f.write("Bag-Size: " + str(bag_size_kb) + " kB" + "\n")
f.write("Payload-Oxum: " + str(bag_size) + "." + str(file_count) + "\n")
f.write("Internal-Sender-Identifier: " + dataset_id + "\n")
f.write("Internal-Sender-Description: " + dataset.description + "\n")
f.write("Contact-Name: " + user_full_name + "\n")
f.write("Contact-Email: " + user.email + "\n")
await run_in_threadpool(
lambda: open(bagit_path, "a").write(
"Bag-Size: "
+ str(bag_size_kb)
+ " kB"
+ "\n"
+ "Payload-Oxum: "
+ str(bag_size)
+ "."
+ str(file_count)
+ "\n"
+ "Internal-Sender-Identifier: "
+ dataset_id
+ "\n"
+ "Internal-Sender-Description: "
+ dataset.description
+ "\n"
+ "Contact-Name: "
+ user_full_name
+ "\n"
+ "Contact-Email: "
+ user.email
+ "\n"
)
)
crate.add_file(
bagit_path, dest_path="bagit.txt", properties={"name": "bagit.txt"}
)
@@ -1292,14 +1331,33 @@
)

# Generate tag manifest file
manifest_md5_hash = hashlib.md5(open(manifest_path, "rb").read()).hexdigest()
bagit_md5_hash = hashlib.md5(open(bagit_path, "rb").read()).hexdigest()
bag_info_md5_hash = hashlib.md5(open(bag_info_path, "rb").read()).hexdigest()

with open(tagmanifest_path, "w") as f:
f.write(bagit_md5_hash + " " + "bagit.txt" + "\n")
f.write(manifest_md5_hash + " " + "manifest-md5.txt" + "\n")
f.write(bag_info_md5_hash + " " + "bag-info.txt" + "\n")
manifest_md5_hash = await run_in_threadpool(
lambda: hashlib.md5(open(manifest_path, "rb").read()).hexdigest()
)
bagit_md5_hash = await run_in_threadpool(
lambda: hashlib.md5(open(bagit_path, "rb").read()).hexdigest()
)
bag_info_md5_hash = await run_in_threadpool(
lambda: hashlib.md5(open(bag_info_path, "rb").read()).hexdigest()
)
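Each of the three hashes above reads the entire file into memory inside the threadpool call. For large bags a chunked read keeps memory flat; a sketch under that assumption (the _md5_of helper is hypothetical, not part of this PR):

# Hypothetical sketch: stream a file through hashlib.md5 in fixed-size chunks,
# still intended to be called via run_in_threadpool.
import hashlib


def _md5_of(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


# e.g. manifest_md5_hash = await run_in_threadpool(_md5_of, manifest_path)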

await run_in_threadpool(
lambda: open(tagmanifest_path, "w").write(
bagit_md5_hash
+ " "
+ "bagit.txt"
+ "\n"
+ manifest_md5_hash
+ " "
+ "manifest-md5.txt"
+ "\n"
+ bag_info_md5_hash
+ " "
+ "bag-info.txt"
+ "\n"
)
)

crate.add_file(
tagmanifest_path,
dest_path="tagmanifest-md5.txt",
@@ -1313,13 +1371,16 @@
)
zip_name = dataset.name + version_name + ".zip"
path_to_zip = os.path.join(current_temp_dir, zip_name)
crate.write_zip(path_to_zip)
f = open(path_to_zip, "rb", buffering=0)
zip_bytes = f.read()

await run_in_threadpool(crate.write_zip, path_to_zip) # takes the most time?

f = await run_in_threadpool(open, path_to_zip, "rb", 0)
zip_bytes = await run_in_threadpool(f.read)
stream = io.BytesIO(zip_bytes)
f.close()
await run_in_threadpool(f.close)

try:
shutil.rmtree(current_temp_dir)
await run_in_threadpool(shutil.rmtree, current_temp_dir)
except Exception as e:
print("could not delete file")
print(e)
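The "# takes the most time?" note above is an open question; one way to answer it is to time the offloaded call. A throwaway timing wrapper, purely illustrative and not part of the PR:

# Hypothetical timing wrapper: measure how long a blocking callable spends in
# the threadpool, e.g. to confirm that crate.write_zip dominates the request.
import time

from starlette.concurrency import run_in_threadpool


async def timed_threadpool(label, func, *args):
    start = time.perf_counter()
    result = await run_in_threadpool(func, *args)
    print(f"{label} took {time.perf_counter() - start:.2f}s")
    return result


# e.g. await timed_threadpool("crate.write_zip", crate.write_zip, path_to_zip)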