Skip to content

Commit

Permalink
set workers based on recommended number of cores (#1170)
Browse files Browse the repository at this point in the history
* set workers based on recommended number of cores

* turn recreate views to false

* add notes to recreate views flag
  • Loading branch information
longshuicy authored Aug 27, 2024
1 parent 0a64e47 commit 8c9eb8a
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 41 deletions.
3 changes: 1 addition & 2 deletions .run/uvicorn.run.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="uvicorn" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
<module name="clowder2" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
Expand All @@ -14,7 +13,7 @@
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="uvicorn" />
<option name="PARAMETERS" value="app.main:app --reload --host 0.0.0.0" />
<option name="PARAMETERS" value="app.main:app --host 0.0.0.0 --workers 17" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="true" />
Expand Down
3 changes: 2 additions & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@ ENV PATH="/code/.venv/bin:$PATH"
COPY ./app /code/app

# launch app using uvicorn
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80"]
# Number of recommended workers is 2 x number_of_cores +1
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "17"]
4 changes: 3 additions & 1 deletion backend/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,9 @@ async def startup_beanie():
ThumbnailDBViewList,
LicenseDB,
],
recreate_views=True,
# If view exists, will not recreate
# When view query changes, make sure to manually drop view and recreate
recreate_views=False,
)


Expand Down
135 changes: 98 additions & 37 deletions backend/app/routers/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
from pymongo import DESCENDING
from rocrate.model.person import Person
from rocrate.rocrate import ROCrate
from starlette.concurrency import run_in_threadpool

router = APIRouter()
security = HTTPBearer()
Expand Down Expand Up @@ -1188,16 +1189,26 @@ async def download_dataset(
bag_info_path = os.path.join(current_temp_dir, "bag-info.txt")
tagmanifest_path = os.path.join(current_temp_dir, "tagmanifest-md5.txt")

with open(manifest_path, "w") as f:
pass # Create empty file so no errors later if the dataset is empty

with open(bagit_path, "w") as f:
f.write("Bag-Software-Agent: clowder.ncsa.illinois.edu" + "\n")
f.write("Bagging-Date: " + str(datetime.datetime.now()) + "\n")
await run_in_threadpool(lambda: open(manifest_path, "w").close())
await run_in_threadpool(lambda: open(manifest_path, "w").close())
await run_in_threadpool(
lambda: open(bagit_path, "w").write(
"Bag-Software-Agent: clowder.ncsa.illinois.edu"
+ "\n"
+ "Bagging-Date: "
+ str(datetime.datetime.now())
+ "\n"
)
)

with open(bag_info_path, "w") as f:
f.write("BagIt-Version: 0.97" + "\n")
f.write("Tag-File-Character-Encoding: UTF-8" + "\n")
await run_in_threadpool(
lambda: open(bag_info_path, "w").write(
"BagIt-Version: 0.97"
+ "\n"
+ "Tag-File-Character-Encoding: UTF-8"
+ "\n"
)
)

# Write dataset metadata if found
metadata = await MetadataDB.find(
Expand All @@ -1210,6 +1221,10 @@ async def download_dataset(
metadata_content = json_util.dumps(metadata)
with open(datasetmetadata_path, "w") as f:
f.write(metadata_content)
await run_in_threadpool(
lambda: open(datasetmetadata_path, "w").write(metadata_content)
)

crate.add_file(
datasetmetadata_path,
dest_path="metadata/_dataset_metadata.json",
Expand All @@ -1232,16 +1247,20 @@ async def download_dataset(
hierarchy = await _get_folder_hierarchy(file.folder_id, "")
dest_folder = os.path.join(current_temp_dir, hierarchy.lstrip("/"))
if not os.path.isdir(dest_folder):
os.makedirs(dest_folder, exist_ok=True)
await run_in_threadpool(os.makedirs, dest_folder, exist_ok=True)
file_name = hierarchy + file_name
current_file_path = os.path.join(current_temp_dir, file_name.lstrip("/"))

content = fs.get_object(settings.MINIO_BUCKET_NAME, bytes_file_id)
file_md5_hash = hashlib.md5(content.data).hexdigest()
with open(current_file_path, "wb") as f1:
f1.write(content.data)
with open(manifest_path, "a") as mpf:
mpf.write(file_md5_hash + " " + file_name + "\n")
await run_in_threadpool(
lambda: open(current_file_path, "wb").write(content.data)
)
await run_in_threadpool(
lambda: open(manifest_path, "a").write(
file_md5_hash + " " + file_name + "\n"
)
)
crate.add_file(
current_file_path,
dest_path="data/" + file_name,
Expand All @@ -1262,23 +1281,43 @@ async def download_dataset(
current_temp_dir, metadata_filename
)
metadata_content = json_util.dumps(metadata)
with open(metadata_filename_temp_path, "w") as f:
f.write(metadata_content)
await run_in_threadpool(
lambda: open(metadata_filename_temp_path, "w").write(
metadata_content
)
)
crate.add_file(
metadata_filename_temp_path,
dest_path="metadata/" + metadata_filename,
properties={"name": metadata_filename},
)

bag_size_kb = bag_size / 1024

with open(bagit_path, "a") as f:
f.write("Bag-Size: " + str(bag_size_kb) + " kB" + "\n")
f.write("Payload-Oxum: " + str(bag_size) + "." + str(file_count) + "\n")
f.write("Internal-Sender-Identifier: " + dataset_id + "\n")
f.write("Internal-Sender-Description: " + dataset.description + "\n")
f.write("Contact-Name: " + user_full_name + "\n")
f.write("Contact-Email: " + user.email + "\n")
await run_in_threadpool(
lambda: open(bagit_path, "a").write(
"Bag-Size: "
+ str(bag_size_kb)
+ " kB"
+ "\n"
+ "Payload-Oxum: "
+ str(bag_size)
+ "."
+ str(file_count)
+ "\n"
+ "Internal-Sender-Identifier: "
+ dataset_id
+ "\n"
+ "Internal-Sender-Description: "
+ dataset.description
+ "\n"
+ "Contact-Name: "
+ user_full_name
+ "\n"
+ "Contact-Email: "
+ user.email
+ "\n"
)
)
crate.add_file(
bagit_path, dest_path="bagit.txt", properties={"name": "bagit.txt"}
)
Expand All @@ -1292,14 +1331,33 @@ async def download_dataset(
)

# Generate tag manifest file
manifest_md5_hash = hashlib.md5(open(manifest_path, "rb").read()).hexdigest()
bagit_md5_hash = hashlib.md5(open(bagit_path, "rb").read()).hexdigest()
bag_info_md5_hash = hashlib.md5(open(bag_info_path, "rb").read()).hexdigest()

with open(tagmanifest_path, "w") as f:
f.write(bagit_md5_hash + " " + "bagit.txt" + "\n")
f.write(manifest_md5_hash + " " + "manifest-md5.txt" + "\n")
f.write(bag_info_md5_hash + " " + "bag-info.txt" + "\n")
manifest_md5_hash = await run_in_threadpool(
lambda: hashlib.md5(open(manifest_path, "rb").read()).hexdigest()
)
bagit_md5_hash = await run_in_threadpool(
lambda: hashlib.md5(open(bagit_path, "rb").read()).hexdigest()
)
bag_info_md5_hash = await run_in_threadpool(
lambda: hashlib.md5(open(bag_info_path, "rb").read()).hexdigest()
)

await run_in_threadpool(
lambda: open(tagmanifest_path, "w").write(
bagit_md5_hash
+ " "
+ "bagit.txt"
+ "\n"
+ manifest_md5_hash
+ " "
+ "manifest-md5.txt"
+ "\n"
+ bag_info_md5_hash
+ " "
+ "bag-info.txt"
+ "\n"
)
)

crate.add_file(
tagmanifest_path,
dest_path="tagmanifest-md5.txt",
Expand All @@ -1313,13 +1371,16 @@ async def download_dataset(
)
zip_name = dataset.name + version_name + ".zip"
path_to_zip = os.path.join(current_temp_dir, zip_name)
crate.write_zip(path_to_zip)
f = open(path_to_zip, "rb", buffering=0)
zip_bytes = f.read()

await run_in_threadpool(crate.write_zip, path_to_zip) # takes the most time?

f = await run_in_threadpool(open, path_to_zip, "rb", 0)
zip_bytes = await run_in_threadpool(f.read)
stream = io.BytesIO(zip_bytes)
f.close()
await run_in_threadpool(f.close)

try:
shutil.rmtree(current_temp_dir)
await run_in_threadpool(shutil.rmtree, current_temp_dir)
except Exception as e:
print("could not delete file")
print(e)
Expand Down

0 comments on commit 8c9eb8a

Please sign in to comment.