qiita-spots · antgonza · Jan 21, 2025 · Apr 30, 2024 · May 7, 2024 · Jun 5, 2024
diff --git a/notebooks/resource-allocation/upload_df.py b/notebooks/resource-allocation/upload_df.py
@@ -0,0 +1,72 @@
+import pandas as pd
+
+# Example data loading
+filename = './data/jobs_2024-02-21.tsv.gz'
+df = pd.read_csv(filename, sep='\t', dtype={'extra_info': str})
+
+# Convert string to timedelta, then to total seconds
+df['ElapsedRawTime'] = pd.to_timedelta(
+                                       df['ElapsedRawTime']).apply(
+                                        lambda x: x.total_seconds())
+
+cname = "Validate"
+sname = "Diversity types - alpha_vector"
+df = df[(df.cName == cname) & (df.sName == sname)]
+
+df['samples'] = df['samples'].fillna(0).astype(int)
+df['columns'] = df['columns'].fillna(0).astype(int)
+df['input_size'] = df['input_size'].fillna(0).astype(int)
+df['MaxRSSRaw'] = df['MaxRSSRaw'].fillna(0).astype(int)
+df['ElapsedRawTime'] = df['ElapsedRawTime'].fillna(0).astype(int)
+
+COL_NAME = 'samples * columns'
+df[COL_NAME] = df['samples'] * df['columns']
+columns = ["MaxRSSRaw", "ElapsedRawTime"]
+max_rows = []
+
+for curr in columns:
+    # Get the maximum value for 'curr' within each COL_NAME group
+    max_values = df.groupby(COL_NAME)[curr].transform(max)
+    # Filter rows where the current column's value
+    # is the maximum within its group
+    curr_rows = df[df[curr] == max_values]
+    max_rows.append(curr_rows)
+
+filtered_df = pd.concat(max_rows).drop_duplicates().reset_index(drop=True)
+
+# INSERT INTO qiita.processing_job(processing_job_id, email, command_id,
+# command_parameters, processing_job_status_id)
+# VALUES('ca27ddbc-a678-4b09-8a1d-b65f52f8eb49',
+# '[email protected]', 1, '""'::json, 1);
+
+# INSERT INTO qiita.slurm_resource_allocations(processing_job_id, samples,
+# columns, input_size, extra_info, memory_used, walltime_used)
+# VALUES('ca27ddbc-a678-4b09-8a1d-b65f52f8eb49', 39, 81, 2, 'nan',
+# 327036000, 91);
+
+# processing_job_id    uuid  NOT NULL,
+# samples              integer,
+# columns              integer,
+# input_size           bigint,
+# extra_info           varchar DEFAULT NULL,
+# memory_used          bigint,
+# walltime_used        integer,
+
+res = ""
+
+for index, row in filtered_df.iterrows():
+    res += f"""('{row['QiitaID']}', '[email protected]', 1, '""'::json, 1),\n"""
+res += ";\n"
+res += "Split\n"
+for index, row in filtered_df.iterrows():
+    res += (
+        f"('{row['QiitaID']}', {int(row['samples'])}, "
+        f"{int(row['columns'])}, {int(row['input_size'])}, "
+        f"'{row['extra_info']}', {int(row['MaxRSSRaw'])}, "
+        f"{int(row['ElapsedRawTime'])}),\n"
+    )
+
+res += ";\n"
+
+with open("sql.txt", 'w') as filename:
+    filename.write(res)
diff --git a/qiita_db/meta_util.py b/qiita_db/meta_util.py
@@ -593,7 +593,7 @@ def update_resource_allocation_redis(active=True):
                 if len(df) == 0:
                     continue
 
-                fig, axs = resource_allocation_plot(df, cname, sname, col_name)
+                fig, axs = resource_allocation_plot(df, col_name)
                 titles = [0, 0]
                 images = [0, 0]
 
@@ -605,21 +605,18 @@ def update_resource_allocation_redis(active=True):
                     # only time
                     new_fig = plt.figure()
                     new_ax = new_fig.add_subplot(111)
-
-                    scatter_data = ax.collections[0]
-                    new_ax.scatter(scatter_data.get_offsets()[:, 0],
-                                   scatter_data.get_offsets()[:, 1],
-                                   s=scatter_data.get_sizes(), label="data")
-
                     line = ax.lines[0]
                     new_ax.plot(line.get_xdata(), line.get_ydata(),
                                 linewidth=1, color='orange')
-
-                    if len(ax.collections) > 1:
-                        failure_data = ax.collections[1]
-                        new_ax.scatter(failure_data.get_offsets()[:, 0],
-                                       failure_data.get_offsets()[:, 1],
-                                       color='red', s=3, label="failures")
+                    handles, labels = ax.get_legend_handles_labels()
+                    for handle, label, scatter_data in zip(handles,
+                                                           labels,
+                                                           ax.collections):
+                        color = handle.get_facecolor()
+                        new_ax.scatter(scatter_data.get_offsets()[:, 0],
+                                       scatter_data.get_offsets()[:, 1],
+                                       s=scatter_data.get_sizes(), label=label,
+                                       color=color)
 
                     new_ax.set_xscale('log')
                     new_ax.set_yscale('log')

diff --git a/qiita_db/support_files/patches/93.sql b/qiita_db/support_files/patches/93.sql
@@ -62,3 +62,11 @@ CREATE INDEX IF NOT EXISTS processing_job_command_parameters_payload ON qiita.pr
--- Addding contraints for the slurm_reservation column
+-- Adding constraints for the slurm_reservation column
 ALTER TABLE qiita.analysis DROP CONSTRAINT IF EXISTS analysis_slurm_reservation_valid_chars;
 ALTER TABLE qiita.analysis ADD CONSTRAINT analysis_slurm_reservation_valid_chars CHECK ( slurm_reservation ~ '^[a-zA-Z0-9_]*$' );
+
+-- Jan 7, 2025
+-- Adding a table for formulas for resource allocations
+CREATE TABLE IF NOT EXISTS qiita.allocation_equations (
+  equation_id     SERIAL PRIMARY KEY,
+  equation_name   TEXT NOT NULL,  -- label identifying the equation
+  expression      TEXT NOT NULL   -- text of the formula
+);
diff --git a/qiita_db/support_files/patches/test_db_sql/93.sql b/qiita_db/support_files/patches/test_db_sql/93.sql
@@ -0,0 +1,10 @@
+INSERT INTO qiita.allocation_equations (equation_name, expression)
+VALUES  -- seed rows for the test database: mem_model1-4, then time_model1-4
+  ('mem_model1', '(k * (np.log(x))) + (x * a) + b'),
+  ('mem_model2', '(k * (np.log(x))) + (b * ((np.log(x))**2)) + a'),
+  ('mem_model3', '(k * (np.log(x))) + (b * ((np.log(x))**2)) + (a * ((np.log(x))**3))'),
+  ('mem_model4', '(k * (np.log(x))) + (b * ((np.log(x))**2)) + (a * ((np.log(x))**2.5))'),
+  ('time_model1', 'a + b + ((np.log(x)) * k)'),
+  ('time_model2', 'a + (b * x) + ((np.log(x)) * k)'),
+  ('time_model3', 'a + (b * ((np.log(x))**2)) + ((np.log(x)) * k)'),
+  ('time_model4', '(a * ((np.log(x))**3)) + (b * ((np.log(x))**2)) + ((np.log(x)) * k)');
diff --git a/qiita_db/test/test_meta_util.py b/qiita_db/test/test_meta_util.py
@@ -532,13 +532,18 @@ def test_update_resource_allocation_redis(self):
             "model: "
             "k * log(x) + "
             "b * log(x)^2 + "
-            "a * log(x)^3" in title_mem
+            "a * log(x)^2.5" in title_mem
         )
 
         title_time_str = 'resources$#%s$#%s$#%s$#%s:%s' % (
                         cname, sname, version, col_name, 'title_time')
         title_time = str(r_client.get(title_time_str))
-        self.assertTrue("model: a + b + log(x) * k" in title_time)
+        self.assertTrue(
+            "model: "
+            "a * log(x)^3 + "
+            "b * log(x)^2 + "
+            "log(x) * k" in title_time
+        )
 
 
 if __name__ == '__main__':

diff --git a/qiita_db/test/test_util.py b/qiita_db/test/test_util.py
@@ -1327,8 +1327,7 @@ def setUp(self):
 
     def test_plot_return(self):
         # check the plot returns correct objects
-        fig1, axs1 = qdb.util.resource_allocation_plot(
-            self.df, self.cname, self.sname, self.col_name)
+        fig1, axs1 = qdb.util.resource_allocation_plot(self.df, self.col_name)
         self.assertIsInstance(
             fig1, Figure,
             "Returned object fig1 is not a Matplotlib Figure")
@@ -1344,46 +1343,46 @@ def test_minimize_const(self):
         self.df[self.col_name] = self.df.samples * self.df['columns']
         fig, axs = plt.subplots(ncols=2, figsize=(10, 4), sharey=False)
 
-        bm, options = qdb.util._resource_allocation_plot_helper(
-            self.df, axs[0], self.cname, self.sname, 'MaxRSSRaw',
-            qdb.util.MODELS_MEM, self.col_name)
+        mem_models, time_models = qdb.util._retrieve_equations()
+        bm_name, bm, options = qdb.util._resource_allocation_plot_helper(
+            self.df, axs[0], 'MaxRSSRaw', mem_models, self.col_name)
         # check that the algorithm chooses correct model for MaxRSSRaw and
         # has 0 failures
         k, a, b = options.x
-        failures_df = qdb.util._resource_allocation_failures(
-            self.df, k, a, b, bm, self.col_name, 'MaxRSSRaw')
+        failures_df = qdb.util._resource_allocation_success_failures(
+            self.df, k, a, b, bm, self.col_name, 'MaxRSSRaw')[-1]
         failures = failures_df.shape[0]
-        self.assertEqual(bm, qdb.util.mem_model3,
+
+        self.assertEqual(bm_name, 'mem_model4',
+                         msg=f"""Best memory model
+                         doesn't match
+                         {bm_name} != 'mem_model4'""")
+        self.assertEqual(bm, mem_models['mem_model4'],
                          msg=f"""Best memory model
                                  doesn't match
                                  Coefficients:{k} {a} {b}
-                                 {qdb.util.mem_model1}, "qdb.util.mem_model1"
-                                 {qdb.util.mem_model2}, "qdb.util.mem_model2"
-                                 {qdb.util.mem_model3}, "qdb.util.mem_model3"
-                                 {qdb.util.mem_model4}, "qdb.util.mem_model4"
                             """)
         self.assertEqual(failures, 0, "Number of failures must be 0")
 
         # check that the algorithm chooses correct model for ElapsedRaw and
-        # has 1 failure
+        # has 0 failures
-        bm, options = qdb.util._resource_allocation_plot_helper(
-            self.df, axs[1], self.cname, self.sname, 'ElapsedRaw',
-            qdb.util.MODELS_TIME, self.col_name)
+        bm_name, bm, options = qdb.util._resource_allocation_plot_helper(
+            self.df, axs[1], 'ElapsedRaw', time_models, self.col_name)
         k, a, b = options.x
-        failures_df = qdb.util._resource_allocation_failures(
-            self.df, k, a, b, bm, self.col_name, 'ElapsedRaw')
+        failures_df = qdb.util._resource_allocation_success_failures(
+            self.df, k, a, b, bm, self.col_name, 'ElapsedRaw')[-1]
         failures = failures_df.shape[0]
+        self.assertEqual(bm_name, 'time_model4',
+                         msg=f"""Best time model
+                         doesn't match
+                         {bm_name} != 'time_model4'""")
 
-        self.assertEqual(bm, qdb.util.time_model1,
+        self.assertEqual(bm, time_models[bm_name],
                          msg=f"""Best time model
                                 doesn't match
                                 Coefficients:{k} {a} {b}
-                                 {qdb.util.time_model1}, "qdb.util.time_model1"
-                                 {qdb.util.time_model2}, "qdb.util.time_model2"
-                                 {qdb.util.time_model3}, "qdb.util.time_model3"
-                                 {qdb.util.time_model4}, "qdb.util.time_model4"
                                 """)
-        self.assertEqual(failures, 1, "Number of failures must be 1")
+        self.assertEqual(failures, 0, "Number of failures must be 0")
 
     def test_MaxRSS_helper(self):
         tests = [