diff --git a/doc/tutorials/customize-opt.ipynb b/doc/tutorials/customize-opt.ipynb index f7f58bcb..71c9f742 100644 --- a/doc/tutorials/customize-opt.ipynb +++ b/doc/tutorials/customize-opt.ipynb @@ -13,7 +13,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "```{contents} Table of Contents\n", + "```{contents} 目录\n", ":local:\n", ":depth: 1\n", "```" @@ -204,3334 +204,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:28:05 [INFO] Logging directory: /tmp/tmp081zz3q0/logs\n", - "2024-09-05 13:28:24 [INFO] LocalBuilder: max_workers = 24\n", - "2024-09-05 13:28:25 [INFO] LocalRunner: max_workers = 1\n", - "2024-09-05 13:28:27 [INFO] [task_scheduler.cc:159] Initializing Task #0: \"main\"\n", - "2024-09-05 13:28:27 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:28:27 [INFO] [task_scheduler.cc:193] Sending 6 sample(s) to builder\n", - "2024-09-05 13:28:30 [INFO] [task_scheduler.cc:195] Sending 6 sample(s) to runner\n", - "2024-09-05 13:28:33 [DEBUG] XGB iter 0: tr-p-rmse: 0.276184\ttr-a-peak@32: 0.929974\ttr-rmse: 0.285904\ttr-rmse: 0.285904\n", - "2024-09-05 13:28:34 [DEBUG] XGB iter 25: tr-p-rmse: 0.019389\ttr-a-peak@32: 1.000000\ttr-rmse: 0.021202\ttr-rmse: 0.021202\n", - "2024-09-05 13:28:34 [DEBUG] XGB iter 50: tr-p-rmse: 0.016852\ttr-a-peak@32: 1.000000\ttr-rmse: 0.017945\ttr-rmse: 0.017945\n", - "2024-09-05 13:28:34 [DEBUG] XGB iter 75: tr-p-rmse: 0.016863\ttr-a-peak@32: 1.000000\ttr-rmse: 0.017944\ttr-rmse: 0.017944\n", - "2024-09-05 13:28:34 [DEBUG] XGB stopped. Best iteration: [38] tr-p-rmse:0.01679\ttr-a-peak@32:1.00000\ttr-rmse:0.01798\ttr-rmse:0.01798 \n", - "2024-09-05 13:28:34 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main110.00033.33283.33286
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 1 1 0.0003 3.3328 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.3328 6 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:28:34 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 1 | 1 | 0.0003 | 3.3328 | 3.3328 | 6 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 6\n", - "Total latency (us): 3.33276\n", - "\n", - "\n", - "Total trials: 6\n", - "Total latency (us): 3.33276\n", - "\n", - "2024-09-05 13:28:34 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:28:34 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder\n", - "2024-09-05 13:28:34 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner\n", - "2024-09-05 13:28:34 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main110.00033.33283.33286
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 1 1 0.0003 3.3328 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.3328 6 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:28:34 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 1 | 1 | 0.0003 | 3.3328 | 3.3328 | 6 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 6\n", - "Total latency (us): 3.33276\n", - "\n", - "\n", - "Total trials: 6\n", - "Total latency (us): 3.33276\n", - "\n", - "2024-09-05 13:28:34 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:28:35 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder\n", - "2024-09-05 13:28:35 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner\n", - "2024-09-05 13:28:35 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main110.00033.33283.33286
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 1 1 0.0003 3.3328 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.3328 6 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:28:35 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 1 | 1 | 0.0003 | 3.3328 | 3.3328 | 6 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 6\n", - "Total latency (us): 3.33276\n", - "\n", - "\n", - "Total trials: 6\n", - "Total latency (us): 3.33276\n", - "\n", - "2024-09-05 13:28:35 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:28:35 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder\n", - "2024-09-05 13:28:35 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner\n", - "2024-09-05 13:28:35 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main110.00033.33283.33286
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 1 1 0.0003 3.3328 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.3328 6 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:28:35 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 1 | 1 | 0.0003 | 3.3328 | 3.3328 | 6 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 6\n", - "Total latency (us): 3.33276\n", - "\n", - "\n", - "Total trials: 6\n", - "Total latency (us): 3.33276\n", - "\n", - "2024-09-05 13:28:35 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:28:36 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder\n", - "2024-09-05 13:28:36 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner\n", - "2024-09-05 13:28:36 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main110.00033.33283.33286
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 1 1 0.0003 3.3328 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.3328 6 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:28:36 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 1 | 1 | 0.0003 | 3.3328 | 3.3328 | 6 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 6\n", - "Total latency (us): 3.33276\n", - "\n", - "\n", - "Total trials: 6\n", - "Total latency (us): 3.33276\n", - "\n", - "2024-09-05 13:28:36 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:28:36 [INFO] [task_scheduler.cc:260] Task #0 has finished. Remaining task(s): 0\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main110.00033.33283.33286Y
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 1 1 0.0003 3.3328 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.3328 6 Y " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:28:36 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 1 | 1 | 0.0003 | 3.3328 | 3.3328 | 6 | Y \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 6\n", - "Total latency (us): 3.33276\n", - "\n", - "\n", - "Total trials: 6\n", - "Total latency (us): 3.33276\n", - "\n", - "2024-09-05 13:28:37 [INFO] Logging directory: /tmp/tmp081zz3q0/logs\n", - "2024-09-05 13:28:37 [INFO] LocalBuilder: max_workers = 24\n", - "2024-09-05 13:28:38 [INFO] LocalRunner: max_workers = 1\n", - "2024-09-05 13:28:39 [INFO] [task_scheduler.cc:159] Initializing Task #0: \"main\"\n", - "2024-09-05 13:28:39 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:28:46 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:29:03 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:29:18 [DEBUG] XGB iter 0: tr-p-rmse: 0.486828\ttr-a-peak@32: 0.973169\ttr-rmse: 0.337774\ttr-rmse: 0.337774\n", - "2024-09-05 13:29:18 [DEBUG] XGB iter 25: tr-p-rmse: 0.033888\ttr-a-peak@32: 1.000000\ttr-rmse: 0.379580\ttr-rmse: 0.379580\n", - "2024-09-05 13:29:19 [DEBUG] XGB iter 50: tr-p-rmse: 0.033888\ttr-a-peak@32: 1.000000\ttr-rmse: 0.379580\ttr-rmse: 0.379580\n", - "2024-09-05 13:29:19 [DEBUG] XGB stopped. Best iteration: [10] tr-p-rmse:0.03389\ttr-a-peak@32:1.00000\ttr-rmse:0.37958\ttr-rmse:0.37958 \n", - "2024-09-05 13:29:19 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.47933.46113.461164
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.4793 3.4611 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.4611 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:29:19 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.4793 | 3.4611 | 3.4611 | 64 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 64\n", - "Total latency (us): 3.46107\n", - "\n", - "\n", - "Total trials: 64\n", - "Total latency (us): 3.46107\n", - "\n", - "2024-09-05 13:29:19 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:29:27 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:29:45 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:30:06 [DEBUG] XGB validation: p-rmse: 0.146614\ta-peak@32: 0.961599\n", - "2024-09-05 13:30:06 [DEBUG] XGB iter 0: tr-p-rmse: 0.468574\ttr-a-peak@32: 1.000000\ttr-rmse: 0.360456\ttr-rmse: 0.360456\n", - "2024-09-05 13:30:06 [DEBUG] XGB iter 25: tr-p-rmse: 0.033107\ttr-a-peak@32: 1.000000\ttr-rmse: 0.403150\ttr-rmse: 0.403150\n", - "2024-09-05 13:30:06 [DEBUG] XGB iter 50: tr-p-rmse: 0.033107\ttr-a-peak@32: 1.000000\ttr-rmse: 0.403150\ttr-rmse: 0.403150\n", - "2024-09-05 13:30:06 [DEBUG] XGB stopped. Best iteration: [10] tr-p-rmse:0.03311\ttr-a-peak@32:1.00000\ttr-rmse:0.40315\ttr-rmse:0.40315 \n", - "2024-09-05 13:30:06 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.52693.35333.3533128
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.5269 3.3533 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.3533 128 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:30:06 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.5269 | 3.3533 | 3.3533 | 128 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 128\n", - "Total latency (us): 3.3533\n", - "\n", - "\n", - "Total trials: 128\n", - "Total latency (us): 3.3533\n", - "\n", - "2024-09-05 13:30:06 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:30:15 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:30:29 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:30:47 [DEBUG] XGB validation: p-rmse: 0.406259\ta-peak@32: 0.745067\n", - "2024-09-05 13:30:47 [DEBUG] XGB iter 0: tr-p-rmse: 0.433241\ttr-a-peak@32: 0.780321\ttr-rmse: 0.385123\ttr-rmse: 0.385123\n", - "2024-09-05 13:30:47 [DEBUG] XGB iter 25: tr-p-rmse: 0.060232\ttr-a-peak@32: 0.960737\ttr-rmse: 0.437616\ttr-rmse: 0.437616\n", - "2024-09-05 13:30:47 [DEBUG] XGB iter 50: tr-p-rmse: 0.060232\ttr-a-peak@32: 0.960737\ttr-rmse: 0.437616\ttr-rmse: 0.437616\n", - "2024-09-05 13:30:47 [DEBUG] XGB stopped. Best iteration: [15] tr-p-rmse:0.06023\ttr-a-peak@32:0.96074\ttr-rmse:0.43762\ttr-rmse:0.43762 \n", - "2024-09-05 13:30:47 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.52693.35333.3533192
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.5269 3.3533 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.3533 192 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:30:47 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.5269 | 3.3533 | 3.3533 | 192 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 192\n", - "Total latency (us): 3.3533\n", - "\n", - "\n", - "Total trials: 192\n", - "Total latency (us): 3.3533\n", - "\n", - "2024-09-05 13:30:47 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:30:56 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:31:05 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:31:23 [DEBUG] XGB validation: p-rmse: 0.145506\ta-peak@32: 0.972814\n", - "2024-09-05 13:31:23 [DEBUG] XGB iter 0: tr-p-rmse: 0.382378\ttr-a-peak@32: 0.750108\ttr-rmse: 0.425792\ttr-rmse: 0.425792\n", - "2024-09-05 13:31:23 [DEBUG] XGB iter 25: tr-p-rmse: 0.062052\ttr-a-peak@32: 0.959037\ttr-rmse: 0.482740\ttr-rmse: 0.482740\n", - "2024-09-05 13:31:23 [DEBUG] XGB iter 50: tr-p-rmse: 0.062052\ttr-a-peak@32: 0.959037\ttr-rmse: 0.482740\ttr-rmse: 0.482740\n", - "2024-09-05 13:31:23 [DEBUG] XGB stopped. Best iteration: [14] tr-p-rmse:0.06205\ttr-a-peak@32:0.95904\ttr-rmse:0.48274\ttr-rmse:0.48274 \n", - "2024-09-05 13:31:23 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.52693.35333.3533256
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.5269 3.3533 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.3533 256 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 256\n", - "Total latency (us): 3.3533\n", - "\n", - "2024-09-05 13:31:23 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.5269 | 3.3533 | 3.3533 | 256 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 256\n", - "Total latency (us): 3.3533\n", - "\n", - "2024-09-05 13:31:23 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:31:34 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:31:43 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:32:02 [DEBUG] XGB validation: p-rmse: 0.278234\ta-peak@32: 0.877145\n", - "2024-09-05 13:32:02 [DEBUG] XGB iter 0: tr-p-rmse: 0.386500\ttr-a-peak@32: 0.704138\ttr-rmse: 0.422897\ttr-rmse: 0.422897\n", - "2024-09-05 13:32:06 [DEBUG] XGB iter 25: tr-p-rmse: 0.090489\ttr-a-peak@32: 0.975444\ttr-rmse: 0.480953\ttr-rmse: 0.480953\n", - "2024-09-05 13:32:06 [DEBUG] XGB iter 50: tr-p-rmse: 0.090489\ttr-a-peak@32: 0.975444\ttr-rmse: 0.480953\ttr-rmse: 0.480953\n", - "2024-09-05 13:32:06 [DEBUG] XGB stopped. Best iteration: [21] tr-p-rmse:0.09049\ttr-a-peak@32:0.97544\ttr-rmse:0.48095\ttr-rmse:0.48095 \n", - "2024-09-05 13:32:06 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.57873.24313.2431320
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.5787 3.2431 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.2431 320 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:32:06 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.5787 | 3.2431 | 3.2431 | 320 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 320\n", - "Total latency (us): 3.24308\n", - "\n", - "\n", - "Total trials: 320\n", - "Total latency (us): 3.24308\n", - "\n", - "2024-09-05 13:32:06 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:32:16 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:32:31 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:32:49 [DEBUG] XGB validation: p-rmse: 0.173908\ta-peak@32: 0.982168\n", - "2024-09-05 13:32:49 [DEBUG] XGB iter 0: tr-p-rmse: 0.379693\ttr-a-peak@32: 0.783704\ttr-rmse: 0.445377\ttr-rmse: 0.445377\n", - "2024-09-05 13:32:49 [DEBUG] XGB iter 25: tr-p-rmse: 0.092154\ttr-a-peak@32: 0.995751\ttr-rmse: 0.503128\ttr-rmse: 0.503128\n", - "2024-09-05 13:32:50 [DEBUG] XGB iter 50: tr-p-rmse: 0.092154\ttr-a-peak@32: 0.995751\ttr-rmse: 0.503128\ttr-rmse: 0.503128\n", - "2024-09-05 13:32:50 [DEBUG] XGB stopped. Best iteration: [16] tr-p-rmse:0.09215\ttr-a-peak@32:0.99575\ttr-rmse:0.50313\ttr-rmse:0.50313 \n", - "2024-09-05 13:32:50 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.59263.21493.2149384
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.5926 3.2149 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.2149 384 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:32:50 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.5926 | 3.2149 | 3.2149 | 384 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 384\n", - "Total latency (us): 3.21488\n", - "\n", - "\n", - "Total trials: 384\n", - "Total latency (us): 3.21488\n", - "\n", - "2024-09-05 13:32:50 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:33:00 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:33:28 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:33:46 [DEBUG] XGB validation: p-rmse: 0.110631\ta-peak@32: 0.870172\n", - "2024-09-05 13:33:46 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.59273.21473.2147448
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.5927 3.2147 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.2147 448 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 448\n", - "Total latency (us): 3.21466\n", - "\n", - "2024-09-05 13:33:46 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.5927 | 3.2147 | 3.2147 | 448 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 448\n", - "Total latency (us): 3.21466\n", - "\n", - "2024-09-05 13:33:46 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:33:55 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:34:05 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:34:24 [DEBUG] XGB validation: p-rmse: 0.147803\ta-peak@32: 0.907423\n", - "2024-09-05 13:34:24 [DEBUG] XGB iter 0: tr-p-rmse: 0.361584\ttr-a-peak@32: 0.833048\ttr-rmse: 0.471870\ttr-rmse: 0.471870\n", - "2024-09-05 13:34:24 [DEBUG] XGB iter 25: tr-p-rmse: 0.093739\ttr-a-peak@32: 1.000000\ttr-rmse: 0.529987\ttr-rmse: 0.529987\n", - "2024-09-05 13:34:24 [DEBUG] XGB iter 50: tr-p-rmse: 0.093739\ttr-a-peak@32: 1.000000\ttr-rmse: 0.529987\ttr-rmse: 0.529987\n", - "2024-09-05 13:34:24 [DEBUG] XGB stopped. Best iteration: [17] tr-p-rmse:0.09374\ttr-a-peak@32:1.00000\ttr-rmse:0.52999\ttr-rmse:0.52999 \n", - "2024-09-05 13:34:24 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.59273.21473.2147512
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.5927 3.2147 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.2147 512 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:34:24 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.5927 | 3.2147 | 3.2147 | 512 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 512\n", - "Total latency (us): 3.21466\n", - "\n", - "\n", - "Total trials: 512\n", - "Total latency (us): 3.21466\n", - "\n", - "2024-09-05 13:34:24 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:34:38 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:34:46 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:35:03 [DEBUG] XGB validation: p-rmse: 0.155478\ta-peak@32: 0.970850\n", - "2024-09-05 13:35:03 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.66443.07613.0761576
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.6644 3.0761 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.0761 576 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:35:03 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.6644 | 3.0761 | 3.0761 | 576 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 576\n", - "Total latency (us): 3.07609\n", - "\n", - "\n", - "Total trials: 576\n", - "Total latency (us): 3.07609\n", - "\n", - "2024-09-05 13:35:03 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:35:16 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:35:34 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:35:52 [DEBUG] XGB validation: p-rmse: 0.202839\ta-peak@32: 0.860199\n", - "2024-09-05 13:35:52 [DEBUG] XGB iter 0: tr-p-rmse: 0.362110\ttr-a-peak@32: 0.916547\ttr-rmse: 0.472539\ttr-rmse: 0.472539\n", - "2024-09-05 13:35:56 [DEBUG] XGB iter 25: tr-p-rmse: 0.102745\ttr-a-peak@32: 0.998645\ttr-rmse: 0.530408\ttr-rmse: 0.530408\n", - "2024-09-05 13:35:56 [DEBUG] XGB iter 50: tr-p-rmse: 0.102745\ttr-a-peak@32: 0.998645\ttr-rmse: 0.530408\ttr-rmse: 0.530408\n", - "2024-09-05 13:35:56 [DEBUG] XGB stopped. Best iteration: [19] tr-p-rmse:0.10275\ttr-a-peak@32:0.99865\ttr-rmse:0.53041\ttr-rmse:0.53041 \n", - "2024-09-05 13:35:56 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.66443.07613.0761640
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.6644 3.0761 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.0761 640 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 640\n", - "Total latency (us): 3.07609\n", - "\n", - "2024-09-05 13:35:56 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.6644 | 3.0761 | 3.0761 | 640 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 640\n", - "Total latency (us): 3.07609\n", - "\n", - "2024-09-05 13:35:56 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:36:08 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:36:18 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:36:35 [DEBUG] XGB validation: p-rmse: 0.107495\ta-peak@32: 0.985335\n", - "2024-09-05 13:36:35 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.66443.07613.0761704
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.6644 3.0761 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.0761 704 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:36:35 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.6644 | 3.0761 | 3.0761 | 704 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 704\n", - "Total latency (us): 3.07609\n", - "\n", - "\n", - "Total trials: 704\n", - "Total latency (us): 3.07609\n", - "\n", - "2024-09-05 13:36:35 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:36:45 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:36:58 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:37:16 [DEBUG] XGB validation: p-rmse: 0.147194\ta-peak@32: 0.990578\n", - "2024-09-05 13:37:16 [DEBUG] XGB iter 0: tr-p-rmse: 0.358715\ttr-a-peak@32: 0.879123\ttr-rmse: 0.503912\ttr-rmse: 0.503912\n", - "2024-09-05 13:37:16 [DEBUG] XGB iter 25: tr-p-rmse: 0.103183\ttr-a-peak@32: 0.998147\ttr-rmse: 0.559628\ttr-rmse: 0.559628\n", - "2024-09-05 13:37:16 [DEBUG] XGB iter 50: tr-p-rmse: 0.103183\ttr-a-peak@32: 0.998147\ttr-rmse: 0.559628\ttr-rmse: 0.559628\n", - "2024-09-05 13:37:17 [DEBUG] XGB stopped. Best iteration: [19] tr-p-rmse:0.10318\ttr-a-peak@32:0.99815\ttr-rmse:0.55963\ttr-rmse:0.55963 \n", - "2024-09-05 13:37:17 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.66443.07613.0761768
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.6644 3.0761 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.0761 768 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 768\n", - "Total latency (us): 3.07609\n", - "\n", - "2024-09-05 13:37:17 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.6644 | 3.0761 | 3.0761 | 768 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 768\n", - "Total latency (us): 3.07609\n", - "\n", - "2024-09-05 13:37:17 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:37:25 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:37:35 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:37:54 [DEBUG] XGB validation: p-rmse: 0.197366\ta-peak@32: 0.992307\n", - "2024-09-05 13:37:54 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.69383.02283.0228832
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.6938 3.0228 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.0228 832 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:37:54 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.6938 | 3.0228 | 3.0228 | 832 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 832\n", - "Total latency (us): 3.02283\n", - "\n", - "\n", - "Total trials: 832\n", - "Total latency (us): 3.02283\n", - "\n", - "2024-09-05 13:37:54 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:38:04 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:38:13 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:38:31 [DEBUG] XGB validation: p-rmse: 0.274855\ta-peak@32: 0.843743\n", - "2024-09-05 13:38:31 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.69383.02283.0228896
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.6938 3.0228 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 3.0228 896 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:38:31 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.6938 | 3.0228 | 3.0228 | 896 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 896\n", - "Total latency (us): 3.02283\n", - "\n", - "\n", - "Total trials: 896\n", - "Total latency (us): 3.02283\n", - "\n", - "2024-09-05 13:38:31 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:38:42 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:38:51 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:39:08 [DEBUG] XGB validation: p-rmse: 0.092897\ta-peak@32: 0.985491\n", - "2024-09-05 13:39:08 [DEBUG] XGB iter 0: tr-p-rmse: 0.354443\ttr-a-peak@32: 0.872333\ttr-rmse: 0.485428\ttr-rmse: 0.485428\n", - "2024-09-05 13:39:09 [DEBUG] XGB iter 25: tr-p-rmse: 0.111097\ttr-a-peak@32: 0.971528\ttr-rmse: 0.543389\ttr-rmse: 0.543389\n", - "2024-09-05 13:39:09 [DEBUG] XGB iter 50: tr-p-rmse: 0.111097\ttr-a-peak@32: 0.971528\ttr-rmse: 0.543389\ttr-rmse: 0.543389\n", - "2024-09-05 13:39:09 [DEBUG] XGB stopped. Best iteration: [17] tr-p-rmse:0.11110\ttr-a-peak@32:0.97153\ttr-rmse:0.54339\ttr-rmse:0.54339 \n", - "2024-09-05 13:39:09 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74152.94002.9400960
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7415 2.9400 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9400 960 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:39:09 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7415 | 2.9400 | 2.9400 | 960 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 960\n", - "Total latency (us): 2.94001\n", - "\n", - "\n", - "Total trials: 960\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:39:09 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:39:18 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:39:29 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:39:48 [DEBUG] XGB validation: p-rmse: 0.242859\ta-peak@32: 0.626766\n", - "2024-09-05 13:39:48 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74152.94002.94001024
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7415 2.9400 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9400 1024 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:39:48 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7415 | 2.9400 | 2.9400 | 1024 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1024\n", - "Total latency (us): 2.94001\n", - "\n", - "\n", - "Total trials: 1024\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:39:48 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:39:57 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:40:08 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:40:28 [DEBUG] XGB validation: p-rmse: 0.106012\ta-peak@32: 0.937584\n", - "2024-09-05 13:40:28 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74152.94002.94001088
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7415 2.9400 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9400 1088 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 1088\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:40:28 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7415 | 2.9400 | 2.9400 | 1088 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1088\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:40:28 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:40:37 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:40:48 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:41:06 [DEBUG] XGB validation: p-rmse: 0.203119\ta-peak@32: 0.731822\n", - "2024-09-05 13:41:06 [DEBUG] XGB iter 0: tr-p-rmse: 0.351379\ttr-a-peak@32: 0.838548\ttr-rmse: 0.483160\ttr-rmse: 0.483160\n", - "2024-09-05 13:41:06 [DEBUG] XGB iter 25: tr-p-rmse: 0.121979\ttr-a-peak@32: 0.955051\ttr-rmse: 0.542120\ttr-rmse: 0.542120\n", - "2024-09-05 13:41:06 [DEBUG] XGB iter 50: tr-p-rmse: 0.121979\ttr-a-peak@32: 0.955051\ttr-rmse: 0.542120\ttr-rmse: 0.542120\n", - "2024-09-05 13:41:06 [DEBUG] XGB stopped. Best iteration: [20] tr-p-rmse:0.12198\ttr-a-peak@32:0.95505\ttr-rmse:0.54212\ttr-rmse:0.54212 \n", - "2024-09-05 13:41:06 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74152.94002.94001152
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7415 2.9400 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9400 1152 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:41:07 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7415 | 2.9400 | 2.9400 | 1152 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1152\n", - "Total latency (us): 2.94001\n", - "\n", - "\n", - "Total trials: 1152\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:41:07 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:41:16 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:41:25 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:41:44 [DEBUG] XGB validation: p-rmse: 0.199220\ta-peak@32: 0.857107\n", - "2024-09-05 13:41:44 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74152.94002.94001216
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7415 2.9400 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9400 1216 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:41:44 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7415 | 2.9400 | 2.9400 | 1216 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1216\n", - "Total latency (us): 2.94001\n", - "\n", - "\n", - "Total trials: 1216\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:41:44 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:41:54 [INFO] [task_scheduler.cc:193] Sending 63 sample(s) to builder\n", - "2024-09-05 13:42:03 [INFO] [task_scheduler.cc:195] Sending 63 sample(s) to runner\n", - "2024-09-05 13:42:23 [DEBUG] XGB validation: p-rmse: 0.334811\ta-peak@32: 1.000000\n", - "2024-09-05 13:42:23 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74152.94002.94001279
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7415 2.9400 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9400 1279 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 1279\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:42:23 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7415 | 2.9400 | 2.9400 | 1279 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1279\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:42:23 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:42:33 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:42:41 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:43:00 [DEBUG] XGB validation: p-rmse: 0.186324\ta-peak@32: 0.994782\n", - "2024-09-05 13:43:00 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74152.94002.94001343
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7415 2.9400 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9400 1343 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:43:00 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7415 | 2.9400 | 2.9400 | 1343 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1343\n", - "Total latency (us): 2.94001\n", - "\n", - "\n", - "Total trials: 1343\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:43:00 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:43:11 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:43:22 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:43:42 [DEBUG] XGB validation: p-rmse: 0.175505\ta-peak@32: 0.876780\n", - "2024-09-05 13:43:42 [DEBUG] XGB iter 0: tr-p-rmse: 0.361002\ttr-a-peak@32: 0.873854\ttr-rmse: 0.477679\ttr-rmse: 0.477679\n", - "2024-09-05 13:43:42 [DEBUG] XGB iter 25: tr-p-rmse: 0.137002\ttr-a-peak@32: 0.962314\ttr-rmse: 0.537038\ttr-rmse: 0.537038\n", - "2024-09-05 13:43:42 [DEBUG] XGB iter 50: tr-p-rmse: 0.137002\ttr-a-peak@32: 0.962314\ttr-rmse: 0.537038\ttr-rmse: 0.537038\n", - "2024-09-05 13:43:42 [DEBUG] XGB stopped. Best iteration: [20] tr-p-rmse:0.13700\ttr-a-peak@32:0.96231\ttr-rmse:0.53704\ttr-rmse:0.53704 \n", - "2024-09-05 13:43:42 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74152.94002.94001407
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7415 2.9400 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9400 1407 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:43:42 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7415 | 2.9400 | 2.9400 | 1407 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1407\n", - "Total latency (us): 2.94001\n", - "\n", - "\n", - "Total trials: 1407\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:43:42 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:43:52 [INFO] [task_scheduler.cc:193] Sending 63 sample(s) to builder\n", - "2024-09-05 13:44:02 [INFO] [task_scheduler.cc:195] Sending 63 sample(s) to runner\n", - "2024-09-05 13:44:18 [DEBUG] XGB validation: p-rmse: 0.143981\ta-peak@32: 0.871370\n", - "2024-09-05 13:44:18 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74152.94002.94001470
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7415 2.9400 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9400 1470 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 1470\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:44:18 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7415 | 2.9400 | 2.9400 | 1470 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1470\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:44:18 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:44:27 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:44:43 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:45:01 [DEBUG] XGB validation: p-rmse: 0.161822\ta-peak@32: 0.944110\n", - "2024-09-05 13:45:01 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74152.94002.94001534
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7415 2.9400 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9400 1534 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 1534\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:45:01 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7415 | 2.9400 | 2.9400 | 1534 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1534\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:45:01 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:45:10 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:45:18 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:45:36 [DEBUG] XGB validation: p-rmse: 0.103546\ta-peak@32: 0.847592\n", - "2024-09-05 13:45:37 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74152.94002.94001598
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7415 2.9400 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9400 1598 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 1598\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:45:37 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7415 | 2.9400 | 2.9400 | 1598 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1598\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:45:37 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:45:45 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:45:54 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:46:14 [DEBUG] XGB validation: p-rmse: 0.164281\ta-peak@32: 0.878372\n", - "2024-09-05 13:46:14 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74152.94002.94001662
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7415 2.9400 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9400 1662 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:46:14 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7415 | 2.9400 | 2.9400 | 1662 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1662\n", - "Total latency (us): 2.94001\n", - "\n", - "\n", - "Total trials: 1662\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:46:14 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:46:25 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:46:35 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:46:54 [DEBUG] XGB validation: p-rmse: 0.102035\ta-peak@32: 0.991930\n", - "2024-09-05 13:46:54 [DEBUG] XGB iter 0: tr-p-rmse: 0.355092\ttr-a-peak@32: 0.882764\ttr-rmse: 0.481188\ttr-rmse: 0.481188\n", - "2024-09-05 13:46:54 [DEBUG] XGB iter 25: tr-p-rmse: 0.137222\ttr-a-peak@32: 0.969667\ttr-rmse: 0.540795\ttr-rmse: 0.540795\n", - "2024-09-05 13:46:55 [DEBUG] XGB iter 50: tr-p-rmse: 0.137222\ttr-a-peak@32: 0.969667\ttr-rmse: 0.540795\ttr-rmse: 0.540795\n", - "2024-09-05 13:46:55 [DEBUG] XGB stopped. Best iteration: [20] tr-p-rmse:0.13722\ttr-a-peak@32:0.96967\ttr-rmse:0.54079\ttr-rmse:0.54079 \n", - "2024-09-05 13:46:55 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74152.94002.94001726
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7415 2.9400 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9400 1726 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:46:55 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7415 | 2.9400 | 2.9400 | 1726 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1726\n", - "Total latency (us): 2.94001\n", - "\n", - "\n", - "Total trials: 1726\n", - "Total latency (us): 2.94001\n", - "\n", - "2024-09-05 13:46:55 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:47:04 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:47:14 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:47:34 [DEBUG] XGB validation: p-rmse: 0.265268\ta-peak@32: 0.779522\n", - "2024-09-05 13:47:34 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74192.93942.93941790
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7419 2.9394 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9394 1790 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:47:34 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7419 | 2.9394 | 2.9394 | 1790 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1790\n", - "Total latency (us): 2.93936\n", - "\n", - "\n", - "Total trials: 1790\n", - "Total latency (us): 2.93936\n", - "\n", - "2024-09-05 13:47:34 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:47:43 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:47:51 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:48:08 [DEBUG] XGB validation: p-rmse: 0.130673\ta-peak@32: 0.990733\n", - "2024-09-05 13:48:08 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74192.93942.93941854
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7419 2.9394 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9394 1854 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:48:08 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7419 | 2.9394 | 2.9394 | 1854 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1854\n", - "Total latency (us): 2.93936\n", - "\n", - "\n", - "Total trials: 1854\n", - "Total latency (us): 2.93936\n", - "\n", - "2024-09-05 13:48:08 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:48:20 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:48:29 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:48:48 [DEBUG] XGB validation: p-rmse: 0.154798\ta-peak@32: 0.758329\n", - "2024-09-05 13:48:48 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74192.93942.93941918
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7419 2.9394 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9394 1918 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 1918\n", - "Total latency (us): 2.93936\n", - "\n", - "2024-09-05 13:48:48 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7419 | 2.9394 | 2.9394 | 1918 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1918\n", - "Total latency (us): 2.93936\n", - "\n", - "2024-09-05 13:48:48 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:49:00 [INFO] [task_scheduler.cc:193] Sending 63 sample(s) to builder\n", - "2024-09-05 13:49:10 [INFO] [task_scheduler.cc:195] Sending 63 sample(s) to runner\n", - "2024-09-05 13:49:30 [DEBUG] XGB validation: p-rmse: 0.159362\ta-peak@32: 0.874314\n", - "2024-09-05 13:49:30 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74192.93942.93941981
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7419 2.9394 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9394 1981 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:49:30 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7419 | 2.9394 | 2.9394 | 1981 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 1981\n", - "Total latency (us): 2.93936\n", - "\n", - "\n", - "Total trials: 1981\n", - "Total latency (us): 2.93936\n", - "\n", - "2024-09-05 13:49:30 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"main\"\n", - "2024-09-05 13:49:40 [INFO] [task_scheduler.cc:193] Sending 19 sample(s) to builder\n", - "2024-09-05 13:49:45 [INFO] [task_scheduler.cc:195] Sending 19 sample(s) to runner\n", - "2024-09-05 13:49:51 [DEBUG] XGB validation: p-rmse: 0.176777\ta-peak@32: 0.810652\n", - "2024-09-05 13:49:51 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"main\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74192.93942.93942000
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7419 2.9394 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9394 2000 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 2000\n", - "Total latency (us): 2.93936\n", - "\n", - "2024-09-05 13:49:51 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7419 | 2.9394 | 2.9394 | 2000 | \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 2000\n", - "Total latency (us): 2.93936\n", - "\n", - "2024-09-05 13:49:51 [INFO] [task_scheduler.cc:260] Task #0 has finished. Remaining task(s): 0\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0main512011.74192.93942.93942000Y
\n", - "
" - ], - "text/plain": [ - " Name FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 main 5120 1 1.7419 2.9394 \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 2.9394 2000 Y " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:49:51 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "---------------------------------------------------------------------------------------------------\n", - " 0 | main | 5120 | 1 | 1.7419 | 2.9394 | 2.9394 | 2000 | Y \n", - "---------------------------------------------------------------------------------------------------\n", - "Total trials: 2000\n", - "Total latency (us): 2.93936\n", - "\n", - "\n", - "Total trials: 2000\n", - "Total latency (us): 2.93936\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[13:49:51] /media/pc/data/lxw/ai/tvm/src/relax/transform/meta_schedule.cc:119: Warning: Creating JSONDatabase. Workload at: /tmp/tmp081zz3q0/database_workload.json, Tuning records at: /tmp/tmp081zz3q0/database_tuning_record.json\n" - ] - }, - { - "data": { - "text/html": [ - "
# from tvm.script import ir as I\n",
-       "# from tvm.script import tir as T\n",
-       "# from tvm.script import relax as R\n",
-       "\n",
-       "@I.ir_module\n",
-       "class Module:\n",
-       "    I.module_attrs({"external_mods": [metadata["runtime.Module"][0]]})\n",
-       "    @T.prim_func(private=True)\n",
-       "    def matmul(lv: T.Buffer((T.int64(1), T.int64(256)), "float32"), permute_dims1: T.Buffer((T.int64(256), T.int64(10)), "float32"), matmul: T.Buffer((T.int64(1), T.int64(10)), "float32")):\n",
-       "        T.func_attr({"op_pattern": 4, "tir.is_scheduled": T.bool(True), "tir.noalias": T.bool(True)})\n",
-       "        # with T.block("root"):\n",
-       "        matmul_local = T.alloc_buffer((T.int64(1), T.int64(10)), scope="local")\n",
-       "        lv_shared = T.alloc_buffer((T.int64(1), T.int64(256)), scope="shared")\n",
-       "        permute_dims1_shared = T.alloc_buffer((T.int64(256), T.int64(10)), scope="shared")\n",
-       "        for i0_0_i1_0_fused in T.thread_binding(T.int64(1), thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step": 1024, "pragma_unroll_explicit": 1}):\n",
-       "            for i0_1_i1_1_fused in T.thread_binding(T.int64(1), thread="vthread.x"):\n",
-       "                for i0_2_i1_2_fused in T.thread_binding(T.int64(10), thread="threadIdx.x"):\n",
-       "                    for i0_3_init, i1_3_init, i0_4_init, i1_4_init in T.grid(T.int64(1), T.int64(1), T.int64(1), T.int64(1)):\n",
-       "                        with T.block("matmul_init"):\n",
-       "                            v_i0 = T.axis.spatial(T.int64(1), i0_3_init + i0_4_init)\n",
-       "                            v_i1 = T.axis.spatial(T.int64(10), i0_2_i1_2_fused + i1_3_init + i1_4_init)\n",
-       "                            T.reads()\n",
-       "                            T.writes(matmul_local[v_i0, v_i1])\n",
-       "                            T.block_attr({"meta_schedule.thread_extent_high_inclusive": 1024, "meta_schedule.thread_extent_low_inclusive": 1, "meta_schedule.tiling_structure": "SSSRRSRS"})\n",
-       "                            matmul_local[v_i0, v_i1] = T.float32(0.0)\n",
-       "                    for k_0 in range(T.int64(1)):\n",
-       "                        for ax0_ax1_fused_0 in range(T.int64(7)):\n",
-       "                            for ax0_ax1_fused_1 in T.thread_binding(T.int64(10), thread="threadIdx.x"):\n",
-       "                                for ax0_ax1_fused_2 in T.vectorized(T.int64(4)):\n",
-       "                                    with T.block("lv_shared"):\n",
-       "                                        v0 = T.axis.spatial(T.int64(1), T.int64(0))\n",
-       "                                        v1 = T.axis.spatial(T.int64(256), ax0_ax1_fused_0 * T.int64(40) + ax0_ax1_fused_1 * T.int64(4) + ax0_ax1_fused_2)\n",
-       "                                        T.where((ax0_ax1_fused_0 * T.int64(10) + ax0_ax1_fused_1) * T.int64(4) + ax0_ax1_fused_2 < T.int64(256))\n",
-       "                                        T.reads(lv[v0, v1])\n",
-       "                                        T.writes(lv_shared[v0, v1])\n",
-       "                                        lv_shared[v0, v1] = lv[v0, v1]\n",
-       "                        for ax0_ax1_fused_0 in range(T.int64(64)):\n",
-       "                            for ax0_ax1_fused_1 in T.thread_binding(T.int64(10), thread="threadIdx.x"):\n",
-       "                                for ax0_ax1_fused_2 in T.vectorized(T.int64(4)):\n",
-       "                                    with T.block("permute_dims1_shared"):\n",
-       "                                        v0 = T.axis.spatial(T.int64(256), (ax0_ax1_fused_0 * T.int64(40) + ax0_ax1_fused_1 * T.int64(4) + ax0_ax1_fused_2) // T.int64(10))\n",
-       "                                        v1 = T.axis.spatial(T.int64(10), (ax0_ax1_fused_0 * T.int64(40) + ax0_ax1_fused_1 * T.int64(4) + ax0_ax1_fused_2) % T.int64(10))\n",
-       "                                        T.reads(permute_dims1[v0, v1])\n",
-       "                                        T.writes(permute_dims1_shared[v0, v1])\n",
-       "                                        permute_dims1_shared[v0, v1] = permute_dims1[v0, v1]\n",
-       "                        for k_1, i0_3, i1_3, k_2, i0_4, i1_4 in T.grid(T.int64(8), T.int64(1), T.int64(1), T.int64(32), T.int64(1), T.int64(1)):\n",
-       "                            with T.block("matmul_update"):\n",
-       "                                v_i0 = T.axis.spatial(T.int64(1), i0_3 + i0_4)\n",
-       "                                v_i1 = T.axis.spatial(T.int64(10), i0_2_i1_2_fused + i1_3 + i1_4)\n",
-       "                                v_k = T.axis.reduce(T.int64(256), k_0 * T.int64(256) + k_1 * T.int64(32) + k_2)\n",
-       "                                T.reads(matmul_local[v_i0, v_i1], lv_shared[v_i0, v_k], permute_dims1_shared[v_k, v_i1])\n",
-       "                                T.writes(matmul_local[v_i0, v_i1])\n",
-       "                                T.block_attr({"meta_schedule.thread_extent_high_inclusive": 1024, "meta_schedule.thread_extent_low_inclusive": 1, "meta_schedule.tiling_structure": "SSSRRSRS"})\n",
-       "                                matmul_local[v_i0, v_i1] = matmul_local[v_i0, v_i1] + lv_shared[v_i0, v_k] * permute_dims1_shared[v_k, v_i1]\n",
-       "                    for ax0, ax1 in T.grid(T.int64(1), T.int64(1)):\n",
-       "                        with T.block("matmul_local"):\n",
-       "                            v0 = T.axis.spatial(T.int64(1), ax0)\n",
-       "                            v1 = T.axis.spatial(T.int64(10), i0_2_i1_2_fused + ax1)\n",
-       "                            T.reads(matmul_local[v0, v1])\n",
-       "                            T.writes(matmul[v0, v1])\n",
-       "                            matmul[v0, v1] = matmul_local[v0, v1]\n",
-       "\n",
-       "    @T.prim_func(private=True)\n",
-       "    def transpose(fc2_weight: T.Buffer((T.int64(10), T.int64(256)), "float32"), T_transpose: T.Buffer((T.int64(256), T.int64(10)), "float32")):\n",
-       "        T.func_attr({"op_pattern": 2, "tir.is_scheduled": T.bool(True), "tir.noalias": T.bool(True)})\n",
-       "        # with T.block("root"):\n",
-       "        for ax0_ax1_fused_0 in T.thread_binding(T.int64(3), thread="blockIdx.x"):\n",
-       "            for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"):\n",
-       "                with T.block("T_transpose"):\n",
-       "                    v_ax0 = T.axis.spatial(T.int64(256), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(10))\n",
-       "                    v_ax1 = T.axis.spatial(T.int64(10), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(10))\n",
-       "                    T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < T.int64(2560))\n",
-       "                    T.reads(fc2_weight[v_ax1, v_ax0])\n",
-       "                    T.writes(T_transpose[v_ax0, v_ax1])\n",
-       "                    T_transpose[v_ax0, v_ax1] = fc2_weight[v_ax1, v_ax0]\n",
-       "\n",
-       "    @R.function\n",
-       "    def forward(x: R.Tensor((1, 784), dtype="float32"), fc1_weight: R.Tensor((256, 784), dtype="float32"), fc1_bias: R.Tensor((256,), dtype="float32"), fc2_weight: R.Tensor((10, 256), dtype="float32")) -> R.Tensor((1, 10), dtype="float32"):\n",
-       "        R.func_attr({"num_input": 1})\n",
-       "        cls = Module\n",
-       "        with R.dataflow():\n",
-       "            lv = R.call_dps_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_relu_cublas", (fc1_weight, x, fc1_bias), out_sinfo=R.Tensor((1, 256), dtype="float32"))\n",
-       "            permute_dims1 = R.call_tir(cls.transpose, (fc2_weight,), out_sinfo=R.Tensor((256, 10), dtype="float32"))\n",
-       "            gv = R.call_tir(cls.matmul, (lv, permute_dims1), out_sinfo=R.Tensor((1, 10), dtype="float32"))\n",
-       "            R.output(gv)\n",
-       "        return gv\n",
-       "\n",
-       "# Metadata omitted. Use show_meta=True in script() method to show it.\n",
-       "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "device = tvm.cuda(0)\n", "target = tvm.target.Target.from_device(device)\n", diff --git a/doc/tutorials/e2e-opt.ipynb b/doc/tutorials/e2e-opt.ipynb index 4f84a415..8989cf80 100644 --- a/doc/tutorials/e2e-opt.ipynb +++ b/doc/tutorials/e2e-opt.ipynb @@ -231,32946 +231,13 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:26:24 [INFO] Logging directory: tuning_logs/logs\n", - "2024-09-05 13:26:44 [INFO] LocalBuilder: max_workers = 24\n", - "2024-09-05 13:26:46 [INFO] LocalRunner: max_workers = 1\n", - "2024-09-05 13:26:47 [INFO] [task_scheduler.cc:159] Initializing Task #0: \"fused_matmul_add13\"\n", - "2024-09-05 13:26:47 [INFO] [task_scheduler.cc:159] Initializing Task #1: \"transpose\"\n", - "2024-09-05 13:26:47 [INFO] [task_scheduler.cc:159] Initializing Task #2: \"reshape\"\n", - "2024-09-05 13:26:47 [INFO] [task_scheduler.cc:159] Initializing Task #3: \"adaptive_avg_pool2d\"\n", - "2024-09-05 13:26:47 [INFO] [task_scheduler.cc:159] Initializing Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[13:26:47] /media/pc/data/lxw/ai/tvm/src/meta_schedule/schedule_rule/apply_custom_rule.cc:56: Warning: Unknown schedule rule \"meta_schedule.adaptive_pool_avg\" for target keys \"[\"cuda\", \"gpu\"]\". Checked PackedFuncs:\n", - " meta_schedule.cuda.meta_schedule.adaptive_pool_avg\n", - " meta_schedule.gpu.meta_schedule.adaptive_pool_avg\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:26:48 [INFO] [task_scheduler.cc:159] Initializing Task #5: \"fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu\"\n", - "2024-09-05 13:26:48 [INFO] [task_scheduler.cc:159] Initializing Task #6: \"fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1\"\n", - "2024-09-05 13:26:48 [INFO] [task_scheduler.cc:159] Initializing Task #7: \"fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 13:26:49 [INFO] [task_scheduler.cc:159] Initializing Task #8: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4\"\n", - "2024-09-05 13:26:49 [INFO] [task_scheduler.cc:159] Initializing Task #9: \"fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2\"\n", - "2024-09-05 13:26:49 [INFO] [task_scheduler.cc:159] Initializing Task #10: \"fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11\"\n", - "2024-09-05 13:26:50 [INFO] [task_scheduler.cc:159] Initializing Task #11: \"max_pool2d\"\n", - "2024-09-05 13:26:50 [INFO] [task_scheduler.cc:159] Initializing Task #12: \"fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3\"\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[13:26:50] /media/pc/data/lxw/ai/tvm/src/meta_schedule/schedule_rule/apply_custom_rule.cc:56: Warning: Unknown schedule rule \"meta_schedule.pool_max\" for target keys \"[\"cuda\", \"gpu\"]\". Checked PackedFuncs:\n", - " meta_schedule.cuda.meta_schedule.pool_max\n", - " meta_schedule.gpu.meta_schedule.pool_max\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:26:50 [INFO] [task_scheduler.cc:159] Initializing Task #13: \"fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5\"\n", - "2024-09-05 13:26:50 [INFO] [task_scheduler.cc:159] Initializing Task #14: \"fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1\"\n", - "2024-09-05 13:26:51 [INFO] [task_scheduler.cc:159] Initializing Task #15: \"fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2\"\n", - "2024-09-05 13:26:51 [INFO] [task_scheduler.cc:159] Initializing Task #16: \"fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3\"\n", - "2024-09-05 13:26:52 [INFO] [task_scheduler.cc:159] Initializing Task #17: \"fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8\"\n", - "2024-09-05 13:26:52 [INFO] [task_scheduler.cc:159] Initializing Task #18: \"fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2\"\n", - "2024-09-05 13:26:52 [INFO] [task_scheduler.cc:159] Initializing Task #19: \"fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001N/AN/AN/A0
1transpose11N/AN/AN/A0
2reshape11N/AN/AN/A0
3adaptive_avg_pool2d256001N/AN/AN/A0
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu42313364481N/AN/AN/A0
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A0
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A0
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A0
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A0
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A0
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A0
11max_pool2d18063361N/AN/AN/A0
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A0
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A0
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 N/A N/A \n", - "1 1 1 N/A N/A \n", - "2 1 1 N/A N/A \n", - "3 25600 1 N/A N/A \n", - "4 231336448 1 N/A N/A \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 N/A 0 \n", - "1 N/A 0 \n", - "2 N/A 0 \n", - "3 N/A 0 \n", - "4 N/A 0 \n", - "5 N/A 0 \n", - "6 N/A 0 \n", - "7 N/A 0 \n", - "8 N/A 0 \n", - "9 N/A 0 \n", - "10 N/A 0 \n", - "11 N/A 0 \n", - "12 N/A 0 \n", - "13 N/A 0 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 0\n", - "Total latency (us): 0\n", - "\n", - "2024-09-05 13:26:53 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | N/A | N/A | N/A | 0 | \n", - " 1 | transpose | 1 | 1 | N/A | N/A | N/A | 0 | \n", - " 2 | reshape | 1 | 1 | N/A | N/A | N/A | 0 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | N/A | N/A | N/A | 0 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | N/A | N/A | N/A | 0 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 0 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 0 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 0 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 0 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 0 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 0 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 0 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 0 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 0 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 0\n", - "Total latency (us): 0\n", - "\n", - "2024-09-05 13:26:53 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"fused_matmul_add13\"\n", - "2024-09-05 13:27:03 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:27:11 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:27:33 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #1: \"transpose\"\n", - "2024-09-05 13:27:34 [INFO] [task_scheduler.cc:193] Sending 1 sample(s) to builder\n", - "2024-09-05 13:27:35 [INFO] [task_scheduler.cc:195] Sending 1 sample(s) to runner\n", - "2024-09-05 13:27:36 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #2: \"reshape\"\n", - "2024-09-05 13:27:36 [INFO] [task_scheduler.cc:193] Sending 5 sample(s) to builder\n", - "2024-09-05 13:27:38 [INFO] [task_scheduler.cc:195] Sending 5 sample(s) to runner\n", - "2024-09-05 13:27:40 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #3: \"adaptive_avg_pool2d\"\n", - "2024-09-05 13:27:42 [INFO] [task_scheduler.cc:193] Sending 62 sample(s) to builder\n", - "2024-09-05 13:27:48 [INFO] [task_scheduler.cc:195] Sending 62 sample(s) to runner\n", - "2024-09-05 13:28:03 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 13:28:43 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:29:16 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:29:33 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #5: \"fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu\"\n", - "2024-09-05 13:30:20 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:31:00 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:31:04 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #6: \"fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1\"\n", - "2024-09-05 13:31:48 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:32:30 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:32:36 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #7: \"fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 13:33:14 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:33:34 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:33:39 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #8: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4\"\n", - "2024-09-05 13:34:29 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:35:03 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:35:05 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #9: \"fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2\"\n", - "2024-09-05 13:35:53 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:36:23 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:36:26 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #10: \"fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11\"\n", - "2024-09-05 13:36:49 [INFO] [task_scheduler.cc:193] Sending 63 sample(s) to builder\n", - "2024-09-05 13:37:17 [INFO] [task_scheduler.cc:195] Sending 63 sample(s) to runner\n", - "2024-09-05 13:37:20 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #11: \"max_pool2d\"\n", - "2024-09-05 13:37:26 [INFO] [task_scheduler.cc:193] Sending 62 sample(s) to builder\n", - "2024-09-05 13:37:34 [INFO] [task_scheduler.cc:195] Sending 62 sample(s) to runner\n", - "2024-09-05 13:37:37 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #12: \"fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3\"\n", - "2024-09-05 13:38:24 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:39:02 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:39:05 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #13: \"fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5\"\n", - "2024-09-05 13:39:29 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:39:55 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:39:57 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #14: \"fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1\"\n", - "2024-09-05 13:40:46 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:41:16 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:41:21 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #15: \"fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2\"\n", - "2024-09-05 13:42:04 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:42:44 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:42:47 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #16: \"fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3\"\n", - "2024-09-05 13:43:27 [INFO] [task_scheduler.cc:193] Sending 63 sample(s) to builder\n", - "2024-09-05 13:44:09 [INFO] [task_scheduler.cc:195] Sending 63 sample(s) to runner\n", - "2024-09-05 13:44:16 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #17: \"fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8\"\n", - "2024-09-05 13:44:37 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:45:13 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:45:16 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #18: \"fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2\"\n", - "2024-09-05 13:46:06 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:46:38 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:46:41 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #19: \"fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3\"\n", - "2024-09-05 13:47:23 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:47:47 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:47:50 [DEBUG] XGB iter 0: tr-p-rmse: 0.482814\ttr-a-peak@32: 0.968058\ttr-rmse: 0.335249\ttr-rmse: 0.335249\n", - "2024-09-05 13:47:50 [DEBUG] XGB iter 25: tr-p-rmse: 0.035264\ttr-a-peak@32: 0.999915\ttr-rmse: 0.375617\ttr-rmse: 0.375617\n", - "2024-09-05 13:47:50 [DEBUG] XGB iter 50: tr-p-rmse: 0.035264\ttr-a-peak@32: 0.999915\ttr-rmse: 0.375617\ttr-rmse: 0.375617\n", - "2024-09-05 13:47:50 [DEBUG] XGB stopped. Best iteration: [12] tr-p-rmse:0.03526\ttr-a-peak@32:0.99991\ttr-rmse:0.37562\ttr-rmse:0.37562 \n", - "2024-09-05 13:47:50 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"fused_matmul_add13\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose11N/AN/AN/A0
2reshape11N/AN/AN/A0
3adaptive_avg_pool2d256001N/AN/AN/A0
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu42313364481N/AN/AN/A0
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A0
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A0
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A0
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A0
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A0
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A0
11max_pool2d18063361N/AN/AN/A0
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A0
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A0
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 N/A N/A \n", - "2 1 1 N/A N/A \n", - "3 25600 1 N/A N/A \n", - "4 231336448 1 N/A N/A \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 N/A 0 \n", - "2 N/A 0 \n", - "3 N/A 0 \n", - "4 N/A 0 \n", - "5 N/A 0 \n", - "6 N/A 0 \n", - "7 N/A 0 \n", - "8 N/A 0 \n", - "9 N/A 0 \n", - "10 N/A 0 \n", - "11 N/A 0 \n", - "12 N/A 0 \n", - "13 N/A 0 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:47:50 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | N/A | N/A | N/A | 0 | \n", - " 2 | reshape | 1 | 1 | N/A | N/A | N/A | 0 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | N/A | N/A | N/A | 0 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | N/A | N/A | N/A | 0 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 0 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 0 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 0 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 0 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 0 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 0 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 0 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 0 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 0 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 64\n", - "Total latency (us): 9.42151\n", - "\n", - "\n", - "Total trials: 64\n", - "Total latency (us): 9.42151\n", - "\n", - "2024-09-05 13:47:50 [INFO] [task_scheduler.cc:237] [Updated] Task #1: \"transpose\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape11N/AN/AN/A0
3adaptive_avg_pool2d256001N/AN/AN/A0
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu42313364481N/AN/AN/A0
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A0
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A0
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A0
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A0
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A0
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A0
11max_pool2d18063361N/AN/AN/A0
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A0
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A0
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 N/A N/A \n", - "3 25600 1 N/A N/A \n", - "4 231336448 1 N/A N/A \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 N/A 0 \n", - "3 N/A 0 \n", - "4 N/A 0 \n", - "5 N/A 0 \n", - "6 N/A 0 \n", - "7 N/A 0 \n", - "8 N/A 0 \n", - "9 N/A 0 \n", - "10 N/A 0 \n", - "11 N/A 0 \n", - "12 N/A 0 \n", - "13 N/A 0 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:47:50 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | N/A | N/A | N/A | 0 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | N/A | N/A | N/A | 0 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | N/A | N/A | N/A | 0 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 0 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 0 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 0 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 0 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 0 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 0 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 0 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 0 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 0 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 65\n", - "Total latency (us): 19.7303\n", - "\n", - "\n", - "Total trials: 65\n", - "Total latency (us): 19.7303\n", - "\n", - "2024-09-05 13:47:50 [INFO] [task_scheduler.cc:237] [Updated] Task #2: \"reshape\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d256001N/AN/AN/A0
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu42313364481N/AN/AN/A0
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A0
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A0
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A0
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A0
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A0
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A0
11max_pool2d18063361N/AN/AN/A0
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A0
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A0
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 N/A N/A \n", - "4 231336448 1 N/A N/A \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 N/A 0 \n", - "4 N/A 0 \n", - "5 N/A 0 \n", - "6 N/A 0 \n", - "7 N/A 0 \n", - "8 N/A 0 \n", - "9 N/A 0 \n", - "10 N/A 0 \n", - "11 N/A 0 \n", - "12 N/A 0 \n", - "13 N/A 0 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:47:50 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | N/A | N/A | N/A | 0 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | N/A | N/A | N/A | 0 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 0 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 0 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 0 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 0 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 0 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 0 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 0 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 0 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 0 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 70\n", - "Total latency (us): 23.0001\n", - "\n", - "\n", - "Total trials: 70\n", - "Total latency (us): 23.0001\n", - "\n", - "2024-09-05 13:47:50 [DEBUG] XGB iter 0: tr-p-rmse: 0.461847\ttr-a-peak@32: 1.000000\ttr-rmse: 0.315594\ttr-rmse: 0.315594\n", - "2024-09-05 13:47:50 [DEBUG] XGB iter 25: tr-p-rmse: 0.056835\ttr-a-peak@32: 0.999915\ttr-rmse: 0.367730\ttr-rmse: 0.367730\n", - "2024-09-05 13:47:50 [DEBUG] XGB iter 50: tr-p-rmse: 0.056837\ttr-a-peak@32: 0.999915\ttr-rmse: 0.367728\ttr-rmse: 0.367728\n", - "2024-09-05 13:47:50 [DEBUG] XGB stopped. Best iteration: [21] tr-p-rmse:0.05677\ttr-a-peak@32:0.99991\ttr-rmse:0.36781\ttr-rmse:0.36781 \n", - "2024-09-05 13:47:50 [INFO] [task_scheduler.cc:237] [Updated] Task #3: \"adaptive_avg_pool2d\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu42313364481N/AN/AN/A0
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A0
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A0
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A0
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A0
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A0
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A0
11max_pool2d18063361N/AN/AN/A0
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A0
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A0
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 N/A N/A \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 N/A 0 \n", - "5 N/A 0 \n", - "6 N/A 0 \n", - "7 N/A 0 \n", - "8 N/A 0 \n", - "9 N/A 0 \n", - "10 N/A 0 \n", - "11 N/A 0 \n", - "12 N/A 0 \n", - "13 N/A 0 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 132\n", - "Total latency (us): 25.21\n", - "\n", - "2024-09-05 13:47:50 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | N/A | N/A | N/A | 0 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 0 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 0 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 0 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 0 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 0 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 0 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 0 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 0 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 0 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 132\n", - "Total latency (us): 25.21\n", - "\n", - "2024-09-05 13:47:51 [DEBUG] XGB iter 0: tr-p-rmse: 0.631297\ttr-a-peak@32: 0.992267\ttr-rmse: 0.364246\ttr-rmse: 0.364246\n", - "2024-09-05 13:47:51 [DEBUG] XGB iter 25: tr-p-rmse: 0.249681\ttr-a-peak@32: 0.968750\ttr-rmse: 0.390801\ttr-rmse: 0.390801\n", - "2024-09-05 13:47:51 [DEBUG] XGB iter 50: tr-p-rmse: 0.249683\ttr-a-peak@32: 0.968750\ttr-rmse: 0.390800\ttr-rmse: 0.390800\n", - "2024-09-05 13:47:51 [DEBUG] XGB stopped. Best iteration: [15] tr-p-rmse:0.24938\ttr-a-peak@32:0.96875\ttr-rmse:0.39097\ttr-rmse:0.39097 \n", - "2024-09-05 13:47:52 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A0
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A0
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A0
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A0
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A0
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A0
11max_pool2d18063361N/AN/AN/A0
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A0
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A0
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 0 \n", - "6 N/A 0 \n", - "7 N/A 0 \n", - "8 N/A 0 \n", - "9 N/A 0 \n", - "10 N/A 0 \n", - "11 N/A 0 \n", - "12 N/A 0 \n", - "13 N/A 0 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:47:52 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 0 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 0 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 0 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 0 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 0 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 0 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 0 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 0 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 0 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:47:53 [DEBUG] XGB iter 0: tr-p-rmse: 0.610195\ttr-a-peak@32: 0.899544\ttr-rmse: 0.690948\ttr-rmse: 0.690948\n", - "2024-09-05 13:47:53 [DEBUG] XGB iter 25: tr-p-rmse: 0.162857\ttr-a-peak@32: 1.000000\ttr-rmse: 0.692006\ttr-rmse: 0.692006\n", - "2024-09-05 13:47:53 [DEBUG] XGB iter 50: tr-p-rmse: 0.162858\ttr-a-peak@32: 1.000000\ttr-rmse: 0.692006\ttr-rmse: 0.692006\n", - "2024-09-05 13:47:53 [DEBUG] XGB stopped. Best iteration: [15] tr-p-rmse:0.16278\ttr-a-peak@32:1.00000\ttr-rmse:0.69206\ttr-rmse:0.69206 \n", - "2024-09-05 13:47:54 [INFO] [task_scheduler.cc:237] [Updated] Task #5: \"fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A0
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A0
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A0
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A0
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A0
11max_pool2d18063361N/AN/AN/A0
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A0
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A0
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 0 \n", - "7 N/A 0 \n", - "8 N/A 0 \n", - "9 N/A 0 \n", - "10 N/A 0 \n", - "11 N/A 0 \n", - "12 N/A 0 \n", - "13 N/A 0 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:47:54 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 0 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 0 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 0 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 0 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 0 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 0 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 0 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 0 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:47:55 [DEBUG] XGB iter 0: tr-p-rmse: 0.540059\ttr-a-peak@32: 0.906250\ttr-rmse: 0.737210\ttr-rmse: 0.737210\n", - "2024-09-05 13:47:55 [DEBUG] XGB iter 25: tr-p-rmse: 0.153869\ttr-a-peak@32: 1.000000\ttr-rmse: 0.737761\ttr-rmse: 0.737761\n", - "2024-09-05 13:47:55 [DEBUG] XGB iter 50: tr-p-rmse: 0.153869\ttr-a-peak@32: 1.000000\ttr-rmse: 0.737761\ttr-rmse: 0.737761\n", - "2024-09-05 13:47:55 [DEBUG] XGB stopped. Best iteration: [11] tr-p-rmse:0.15383\ttr-a-peak@32:1.00000\ttr-rmse:0.73782\ttr-rmse:0.73782 \n", - "2024-09-05 13:47:56 [INFO] [task_scheduler.cc:237] [Updated] Task #6: \"fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A0
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A0
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A0
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A0
11max_pool2d18063361N/AN/AN/A0
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A0
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A0
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 0 \n", - "8 N/A 0 \n", - "9 N/A 0 \n", - "10 N/A 0 \n", - "11 N/A 0 \n", - "12 N/A 0 \n", - "13 N/A 0 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:47:56 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 0 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 0 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 0 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 0 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 0 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 0 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 0 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:47:57 [INFO] [task_scheduler.cc:237] [Updated] Task #7: \"fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A0
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A0
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A0
11max_pool2d18063361N/AN/AN/A0
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A0
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A0
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 0 \n", - "9 N/A 0 \n", - "10 N/A 0 \n", - "11 N/A 0 \n", - "12 N/A 0 \n", - "13 N/A 0 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:47:57 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 0 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 0 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 0 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 0 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 0 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 0 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:47:58 [DEBUG] XGB iter 0: tr-p-rmse: 0.446722\ttr-a-peak@32: 0.991325\ttr-rmse: 0.765663\ttr-rmse: 0.765663\n", - "2024-09-05 13:47:59 [DEBUG] XGB iter 25: tr-p-rmse: 0.284486\ttr-a-peak@32: 1.000000\ttr-rmse: 0.766375\ttr-rmse: 0.766375\n", - "2024-09-05 13:47:59 [DEBUG] XGB iter 50: tr-p-rmse: 0.284486\ttr-a-peak@32: 1.000000\ttr-rmse: 0.766375\ttr-rmse: 0.766375\n", - "2024-09-05 13:47:59 [DEBUG] XGB stopped. Best iteration: [21] tr-p-rmse:0.28447\ttr-a-peak@32:1.00000\ttr-rmse:0.76638\ttr-rmse:0.76638 \n", - "2024-09-05 13:48:00 [INFO] [task_scheduler.cc:237] [Updated] Task #8: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A0
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A0
11max_pool2d18063361N/AN/AN/A0
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A0
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A0
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 0 \n", - "10 N/A 0 \n", - "11 N/A 0 \n", - "12 N/A 0 \n", - "13 N/A 0 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:00 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 0 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 0 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 0 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 0 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 0 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:01 [INFO] [task_scheduler.cc:237] [Updated] Task #9: \"fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A0
11max_pool2d18063361N/AN/AN/A0
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A0
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A0
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 0 \n", - "11 N/A 0 \n", - "12 N/A 0 \n", - "13 N/A 0 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:48:01 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 0 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 0 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 0 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 0 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:02 [DEBUG] XGB iter 0: tr-p-rmse: 0.389224\ttr-a-peak@32: 0.991325\ttr-rmse: 0.776519\ttr-rmse: 0.776519\n", - "2024-09-05 13:48:02 [DEBUG] XGB iter 25: tr-p-rmse: 0.243136\ttr-a-peak@32: 1.000000\ttr-rmse: 0.777008\ttr-rmse: 0.777008\n", - "2024-09-05 13:48:03 [DEBUG] XGB iter 50: tr-p-rmse: 0.243136\ttr-a-peak@32: 1.000000\ttr-rmse: 0.777008\ttr-rmse: 0.777008\n", - "2024-09-05 13:48:03 [DEBUG] XGB stopped. Best iteration: [21] tr-p-rmse:0.24313\ttr-a-peak@32:1.00000\ttr-rmse:0.77701\ttr-rmse:0.77701 \n", - "2024-09-05 13:48:03 [INFO] [task_scheduler.cc:237] [Updated] Task #10: \"fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A0
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A0
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A0
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 0 \n", - "12 N/A 0 \n", - "13 N/A 0 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:03 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 0 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 0 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 0 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:04 [INFO] [task_scheduler.cc:237] [Updated] Task #11: \"max_pool2d\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A0
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A0
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 0 \n", - "13 N/A 0 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:04 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 0 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 0 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:05 [DEBUG] XGB iter 0: tr-p-rmse: 0.368476\ttr-a-peak@32: 0.996597\ttr-rmse: 0.771927\ttr-rmse: 0.771927\n", - "2024-09-05 13:48:05 [DEBUG] XGB iter 25: tr-p-rmse: 0.224196\ttr-a-peak@32: 1.000000\ttr-rmse: 0.769941\ttr-rmse: 0.769941\n", - "2024-09-05 13:48:05 [DEBUG] XGB iter 50: tr-p-rmse: 0.224196\ttr-a-peak@32: 1.000000\ttr-rmse: 0.769941\ttr-rmse: 0.769941\n", - "2024-09-05 13:48:06 [DEBUG] XGB stopped. Best iteration: [24] tr-p-rmse:0.22420\ttr-a-peak@32:1.00000\ttr-rmse:0.76994\ttr-rmse:0.76994 \n", - "2024-09-05 13:48:06 [INFO] [task_scheduler.cc:237] [Updated] Task #12: \"fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A0
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 0 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:06 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 0 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:07 [INFO] [task_scheduler.cc:237] [Updated] Task #13: \"fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A0
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 0 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:07 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 0 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:08 [INFO] [task_scheduler.cc:237] [Updated] Task #14: \"fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A0
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 0 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:48:09 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 0 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:09 [DEBUG] XGB iter 0: tr-p-rmse: 0.321691\ttr-a-peak@32: 0.997170\ttr-rmse: 0.779901\ttr-rmse: 0.779901\n", - "2024-09-05 13:48:09 [DEBUG] XGB iter 25: tr-p-rmse: 0.196667\ttr-a-peak@32: 1.000000\ttr-rmse: 0.778481\ttr-rmse: 0.778481\n", - "2024-09-05 13:48:09 [DEBUG] XGB iter 50: tr-p-rmse: 0.196668\ttr-a-peak@32: 1.000000\ttr-rmse: 0.778480\ttr-rmse: 0.778480\n", - "2024-09-05 13:48:10 [DEBUG] XGB stopped. Best iteration: [24] tr-p-rmse:0.19665\ttr-a-peak@32:1.00000\ttr-rmse:0.77849\ttr-rmse:0.77849 \n", - "2024-09-05 13:48:10 [INFO] [task_scheduler.cc:237] [Updated] Task #15: \"fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A0
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 0 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "2024-09-05 13:48:10 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 0 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "2024-09-05 13:48:11 [INFO] [task_scheduler.cc:237] [Updated] Task #16: \"fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A0
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 0 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:11 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 0 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:12 [INFO] [task_scheduler.cc:237] [Updated] Task #17: \"fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A0
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 0 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:12 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 0 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:13 [DEBUG] XGB iter 0: tr-p-rmse: 0.288673\ttr-a-peak@32: 0.996597\ttr-rmse: 0.784277\ttr-rmse: 0.784277\n", - "2024-09-05 13:48:13 [DEBUG] XGB iter 25: tr-p-rmse: 0.174178\ttr-a-peak@32: 1.000000\ttr-rmse: 0.783190\ttr-rmse: 0.783190\n", - "2024-09-05 13:48:13 [DEBUG] XGB iter 50: tr-p-rmse: 0.174178\ttr-a-peak@32: 1.000000\ttr-rmse: 0.783190\ttr-rmse: 0.783190\n", - "2024-09-05 13:48:13 [DEBUG] XGB stopped. Best iteration: [24] tr-p-rmse:0.17418\ttr-a-peak@32:1.00000\ttr-rmse:0.78319\ttr-rmse:0.78319 \n", - "2024-09-05 13:48:13 [INFO] [task_scheduler.cc:237] [Updated] Task #18: \"fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A0
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:48:13 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 0 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:14 [INFO] [task_scheduler.cc:237] [Updated] Task #19: \"fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.125564
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 64 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:14 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 64 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 196\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:48:14 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 13:49:20 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:49:41 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:49:44 [DEBUG] XGB validation: p-rmse: 0.999471\ta-peak@32: 1.000000\n", - "2024-09-05 13:49:45 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255128
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 128 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 260\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:49:45 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 128 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 260\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:49:45 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 13:50:41 [INFO] [task_scheduler.cc:193] Sending 63 sample(s) to builder\n", - "2024-09-05 13:50:51 [INFO] [task_scheduler.cc:195] Sending 63 sample(s) to runner\n", - "2024-09-05 13:50:53 [DEBUG] XGB validation: p-rmse: 0.999837\ta-peak@32: 1.000000\n", - "2024-09-05 13:50:54 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255191
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 191 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:50:54 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 191 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 323\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 323\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:50:54 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 13:51:51 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:52:03 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:52:06 [DEBUG] XGB validation: p-rmse: 0.995014\ta-peak@32: 1.000000\n", - "2024-09-05 13:52:06 [DEBUG] XGB iter 0: tr-p-rmse: 0.469885\ttr-a-peak@32: 0.996884\ttr-rmse: 0.785376\ttr-rmse: 0.785376\n", - "2024-09-05 13:52:07 [DEBUG] XGB iter 25: tr-p-rmse: 0.416552\ttr-a-peak@32: 0.906250\ttr-rmse: 0.784374\ttr-rmse: 0.784374\n", - "2024-09-05 13:52:07 [DEBUG] XGB iter 50: tr-p-rmse: 0.416552\ttr-a-peak@32: 0.906250\ttr-rmse: 0.784374\ttr-rmse: 0.784374\n", - "2024-09-05 13:52:07 [DEBUG] XGB stopped. Best iteration: [21] tr-p-rmse:0.41654\ttr-a-peak@32:0.90625\ttr-rmse:0.78438\ttr-rmse:0.78438 \n", - "2024-09-05 13:52:07 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255255
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 255 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 387\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:52:07 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 255 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 387\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:52:07 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 13:53:01 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:53:14 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:53:17 [DEBUG] XGB validation: p-rmse: 0.993088\ta-peak@32: 1.000000\n", - "2024-09-05 13:53:17 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255319
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 319 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:53:18 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 319 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 451\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 451\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:53:18 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 13:54:13 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:54:24 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:54:27 [DEBUG] XGB validation: p-rmse: 0.995644\ta-peak@32: 1.000000\n", - "2024-09-05 13:54:27 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255383
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 383 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:54:27 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 383 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 515\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 515\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:54:27 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 13:55:23 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:55:37 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:55:40 [DEBUG] XGB validation: p-rmse: 0.996830\ta-peak@32: 1.000000\n", - "2024-09-05 13:55:40 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255447
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 447 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:55:40 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 447 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 579\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 579\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:55:40 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 13:56:36 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:56:47 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:56:50 [DEBUG] XGB validation: p-rmse: 0.992858\ta-peak@32: 1.000000\n", - "2024-09-05 13:56:51 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255511
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 511 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 13:56:51 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 511 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 643\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 643\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:56:51 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 13:57:45 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:57:54 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:57:57 [DEBUG] XGB validation: p-rmse: 0.992685\ta-peak@32: 1.000000\n", - "2024-09-05 13:57:57 [DEBUG] XGB iter 0: tr-p-rmse: 0.617149\ttr-a-peak@32: 0.996884\ttr-rmse: 0.785376\ttr-rmse: 0.785376\n", - "2024-09-05 13:57:58 [DEBUG] XGB iter 25: tr-p-rmse: 0.584664\ttr-a-peak@32: 0.843750\ttr-rmse: 0.784374\ttr-rmse: 0.784374\n", - "2024-09-05 13:57:58 [DEBUG] XGB iter 50: tr-p-rmse: 0.584664\ttr-a-peak@32: 0.843750\ttr-rmse: 0.784374\ttr-rmse: 0.784374\n", - "2024-09-05 13:57:58 [DEBUG] XGB stopped. Best iteration: [21] tr-p-rmse:0.58465\ttr-a-peak@32:0.84375\ttr-rmse:0.78438\ttr-rmse:0.78438 \n", - "2024-09-05 13:57:58 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255575
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 575 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 707\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:57:58 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 575 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 707\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:57:58 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 13:58:53 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 13:59:15 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 13:59:18 [DEBUG] XGB validation: p-rmse: 0.992945\ta-peak@32: 1.000000\n", - "2024-09-05 13:59:19 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255639
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 639 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 771\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:59:19 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 639 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 771\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 13:59:19 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:00:13 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:00:26 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:00:29 [DEBUG] XGB validation: p-rmse: 0.994188\ta-peak@32: 1.000000\n", - "2024-09-05 14:00:29 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255703
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 703 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:00:29 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 703 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 835\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 835\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:00:29 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #1: \"transpose\"\n", - "2024-09-05 14:00:31 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder\n", - "2024-09-05 14:00:31 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner\n", - "2024-09-05 14:00:31 [INFO] [task_scheduler.cc:237] [Updated] Task #1: \"transpose\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255703
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 703 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 835\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:00:31 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 703 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 835\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:00:31 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:01:26 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:01:41 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:01:44 [DEBUG] XGB validation: p-rmse: 0.990801\ta-peak@32: 1.000000\n", - "2024-09-05 14:01:44 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.421564
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255767
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 64 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 767 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 899\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:01:44 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 64 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 767 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 899\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:01:44 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"fused_matmul_add13\"\n", - "2024-09-05 14:01:58 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:02:07 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:02:10 [DEBUG] XGB validation: p-rmse: 0.941462\ta-peak@32: 1.000000\n", - "2024-09-05 14:02:10 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"fused_matmul_add13\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215128
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255767
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 128 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 767 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:02:10 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 128 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 767 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 963\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 963\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:02:10 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:03:02 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:03:24 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:03:27 [DEBUG] XGB validation: p-rmse: 0.997613\ta-peak@32: 1.000000\n", - "2024-09-05 14:03:27 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215128
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255831
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 128 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 831 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 1027\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:03:27 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 128 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 831 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1027\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:03:27 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:04:22 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:04:38 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:04:41 [DEBUG] XGB validation: p-rmse: 0.990378\ta-peak@32: 1.000000\n", - "2024-09-05 14:04:41 [DEBUG] XGB iter 0: tr-p-rmse: 0.700027\ttr-a-peak@32: 0.996884\ttr-rmse: 0.785376\ttr-rmse: 0.785376\n", - "2024-09-05 14:04:41 [DEBUG] XGB iter 25: tr-p-rmse: 0.681111\ttr-a-peak@32: 0.687500\ttr-rmse: 0.784374\ttr-rmse: 0.784374\n", - "2024-09-05 14:04:42 [DEBUG] XGB iter 50: tr-p-rmse: 0.681111\ttr-a-peak@32: 0.687500\ttr-rmse: 0.784374\ttr-rmse: 0.784374\n", - "2024-09-05 14:04:42 [DEBUG] XGB stopped. Best iteration: [21] tr-p-rmse:0.68109\ttr-a-peak@32:0.68750\ttr-rmse:0.78438\ttr-rmse:0.78438 \n", - "2024-09-05 14:04:42 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215128
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255895
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 128 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 895 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:04:42 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 128 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 895 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1091\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 1091\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:04:42 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:05:32 [INFO] [task_scheduler.cc:193] Sending 63 sample(s) to builder\n", - "2024-09-05 14:05:45 [INFO] [task_scheduler.cc:195] Sending 63 sample(s) to runner\n", - "2024-09-05 14:05:48 [DEBUG] XGB validation: p-rmse: 0.999027\ta-peak@32: 1.000000\n", - "2024-09-05 14:05:49 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215128
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.1255958
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 128 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 958 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 1154\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:05:49 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 128 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 958 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1154\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:05:49 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:06:38 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:06:50 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:06:53 [DEBUG] XGB validation: p-rmse: 0.991482\ta-peak@32: 1.000000\n", - "2024-09-05 14:06:53 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215128
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551022
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 128 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1022 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:06:53 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 128 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1022 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1218\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 1218\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:06:53 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:07:43 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:07:55 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:07:58 [DEBUG] XGB validation: p-rmse: 0.988367\ta-peak@32: 1.000000\n", - "2024-09-05 14:07:59 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215128
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551086
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 128 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1086 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:07:59 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 128 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1086 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1282\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 1282\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:07:59 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:08:49 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:09:06 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:09:09 [DEBUG] XGB validation: p-rmse: 0.991758\ta-peak@32: 1.000000\n", - "2024-09-05 14:09:09 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215128
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551150
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 128 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1150 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:09:09 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 128 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1150 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1346\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 1346\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:09:09 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:10:00 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:10:13 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:10:16 [DEBUG] XGB validation: p-rmse: 0.988061\ta-peak@32: 1.000000\n", - "2024-09-05 14:10:16 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215128
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551214
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 128 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1214 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:10:16 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 128 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1214 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1410\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 1410\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:10:16 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:11:10 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:11:20 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:11:23 [DEBUG] XGB validation: p-rmse: 0.991761\ta-peak@32: 1.000000\n", - "2024-09-05 14:11:23 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215128
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551278
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 128 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1278 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:11:23 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 128 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1278 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1474\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 1474\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:11:23 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:12:17 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:12:34 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:12:37 [DEBUG] XGB validation: p-rmse: 0.993003\ta-peak@32: 1.000000\n", - "2024-09-05 14:12:37 [DEBUG] XGB iter 0: tr-p-rmse: 0.765134\ttr-a-peak@32: 0.996597\ttr-rmse: 0.785376\ttr-rmse: 0.785376\n", - "2024-09-05 14:12:37 [DEBUG] XGB iter 25: tr-p-rmse: 0.749778\ttr-a-peak@32: 0.562500\ttr-rmse: 0.784374\ttr-rmse: 0.784374\n", - "2024-09-05 14:12:37 [DEBUG] XGB iter 50: tr-p-rmse: 0.749778\ttr-a-peak@32: 0.562500\ttr-rmse: 0.784374\ttr-rmse: 0.784374\n", - "2024-09-05 14:12:37 [DEBUG] XGB stopped. Best iteration: [21] tr-p-rmse:0.74976\ttr-a-peak@32:0.56250\ttr-rmse:0.78438\ttr-rmse:0.78438 \n", - "2024-09-05 14:12:38 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215128
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551342
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 128 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1342 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:12:38 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 128 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1342 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1538\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 1538\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:12:38 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:13:31 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:13:44 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:13:47 [DEBUG] XGB validation: p-rmse: 0.988875\ta-peak@32: 1.000000\n", - "2024-09-05 14:13:47 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215128
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551406
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 128 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1406 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:13:47 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 128 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1406 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1602\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 1602\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:13:47 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #1: \"transpose\"\n", - "2024-09-05 14:13:49 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder\n", - "2024-09-05 14:13:49 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner\n", - "2024-09-05 14:13:49 [INFO] [task_scheduler.cc:237] [Updated] Task #1: \"transpose\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215128
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551406
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 128 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1406 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 1602\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:13:49 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 128 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1406 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1602\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:13:49 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:14:42 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:14:58 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:15:01 [DEBUG] XGB validation: p-rmse: 0.997105\ta-peak@32: 1.000000\n", - "2024-09-05 14:15:01 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215128
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551470
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 128 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1470 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:15:01 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 128 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1470 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1666\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 1666\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:15:01 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:15:52 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:16:05 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:16:08 [DEBUG] XGB validation: p-rmse: 0.993584\ta-peak@32: 1.000000\n", - "2024-09-05 14:16:08 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215128
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551534
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 128 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1534 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:16:08 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 128 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1534 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1730\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 1730\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:16:08 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"fused_matmul_add13\"\n", - "2024-09-05 14:16:21 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:16:32 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:16:35 [DEBUG] XGB validation: p-rmse: 0.917834\ta-peak@32: 1.000000\n", - "2024-09-05 14:16:35 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"fused_matmul_add13\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551534
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1534 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 1794\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:16:35 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1534 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1794\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:16:35 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:17:26 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:17:37 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:17:40 [DEBUG] XGB validation: p-rmse: 0.997021\ta-peak@32: 1.000000\n", - "2024-09-05 14:17:40 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551598
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1598 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 1858\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:17:40 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1598 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1858\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:17:40 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:18:31 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:18:47 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:18:50 [DEBUG] XGB validation: p-rmse: 0.989833\ta-peak@32: 1.000000\n", - "2024-09-05 14:18:50 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551662
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1662 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:18:50 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1662 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1922\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 1922\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:18:50 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:19:43 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:19:58 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:20:01 [DEBUG] XGB validation: p-rmse: 0.998325\ta-peak@32: 1.000000\n", - "2024-09-05 14:20:01 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551726
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1726 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:20:01 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1726 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 1986\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 1986\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:20:01 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:20:52 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:21:09 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:21:12 [DEBUG] XGB validation: p-rmse: 0.993594\ta-peak@32: 1.000000\n", - "2024-09-05 14:21:12 [DEBUG] XGB iter 0: tr-p-rmse: 0.806481\ttr-a-peak@32: 0.996884\ttr-rmse: 0.785376\ttr-rmse: 0.785376\n", - "2024-09-05 14:21:13 [DEBUG] XGB iter 25: tr-p-rmse: 0.796109\ttr-a-peak@32: 0.062500\ttr-rmse: 0.784374\ttr-rmse: 0.784374\n", - "2024-09-05 14:21:13 [DEBUG] XGB iter 50: tr-p-rmse: 0.796109\ttr-a-peak@32: 0.062500\ttr-rmse: 0.784374\ttr-rmse: 0.784374\n", - "2024-09-05 14:21:13 [DEBUG] XGB stopped. Best iteration: [21] tr-p-rmse:0.79609\ttr-a-peak@32:0.06250\ttr-rmse:0.78438\ttr-rmse:0.78438 \n", - "2024-09-05 14:21:13 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551790
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1790 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:21:13 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1790 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2050\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2050\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:21:13 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:22:09 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:22:42 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:22:45 [DEBUG] XGB validation: p-rmse: 0.992930\ta-peak@32: 1.000000\n", - "2024-09-05 14:22:46 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551854
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1854 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:22:46 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1854 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2114\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2114\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:22:46 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:23:41 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:24:09 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:24:12 [DEBUG] XGB validation: p-rmse: 0.993994\ta-peak@32: 1.000000\n", - "2024-09-05 14:24:12 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551918
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1918 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:24:12 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1918 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2178\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2178\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:24:12 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:25:03 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:25:16 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:25:19 [DEBUG] XGB validation: p-rmse: 0.993339\ta-peak@32: 1.000000\n", - "2024-09-05 14:25:19 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12551982
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 1982 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:25:19 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 1982 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2242\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2242\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:25:19 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:26:11 [INFO] [task_scheduler.cc:193] Sending 63 sample(s) to builder\n", - "2024-09-05 14:26:26 [INFO] [task_scheduler.cc:195] Sending 63 sample(s) to runner\n", - "2024-09-05 14:26:28 [DEBUG] XGB validation: p-rmse: 0.995799\ta-peak@32: 1.000000\n", - "2024-09-05 14:26:29 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552045
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2045 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:26:29 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2045 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2305\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2305\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:26:29 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:27:22 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:27:47 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:27:49 [DEBUG] XGB validation: p-rmse: 0.994259\ta-peak@32: 1.000000\n", - "2024-09-05 14:27:50 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552109
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2109 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:27:50 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2109 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2369\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2369\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:27:50 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #1: \"transpose\"\n", - "2024-09-05 14:27:52 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder\n", - "2024-09-05 14:27:52 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner\n", - "2024-09-05 14:27:52 [INFO] [task_scheduler.cc:237] [Updated] Task #1: \"transpose\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552109
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2109 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 2369\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:27:52 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2109 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2369\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:27:52 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:28:51 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:29:05 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:29:08 [DEBUG] XGB validation: p-rmse: 0.991865\ta-peak@32: 1.000000\n", - "2024-09-05 14:29:08 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552173
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2173 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:29:08 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2173 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2433\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2433\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:29:08 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:30:00 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:30:11 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:30:14 [DEBUG] XGB validation: p-rmse: 0.990148\ta-peak@32: 1.000000\n", - "2024-09-05 14:30:14 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552237
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2237 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:30:14 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2237 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2497\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2497\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:30:14 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #2: \"reshape\"\n", - "2024-09-05 14:30:15 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder\n", - "2024-09-05 14:30:15 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner\n", - "2024-09-05 14:30:15 [INFO] [task_scheduler.cc:237] [Updated] Task #2: \"reshape\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552237
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2237 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:30:15 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2237 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2497\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2497\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:30:15 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:31:07 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:31:42 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:31:46 [DEBUG] XGB validation: p-rmse: 0.991221\ta-peak@32: 1.000000\n", - "2024-09-05 14:31:47 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215192
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552301
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 192 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2301 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:31:47 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 192 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2301 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2561\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2561\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:31:47 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"fused_matmul_add13\"\n", - "2024-09-05 14:32:00 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:32:11 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:32:14 [DEBUG] XGB validation: p-rmse: 0.908812\ta-peak@32: 1.000000\n", - "2024-09-05 14:32:14 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"fused_matmul_add13\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215256
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552301
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 256 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2301 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:32:14 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 256 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2301 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2625\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2625\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:32:14 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:33:05 [INFO] [task_scheduler.cc:193] Sending 63 sample(s) to builder\n", - "2024-09-05 14:33:25 [INFO] [task_scheduler.cc:195] Sending 63 sample(s) to runner\n", - "2024-09-05 14:33:28 [DEBUG] XGB validation: p-rmse: 0.992482\ta-peak@32: 1.000000\n", - "2024-09-05 14:33:28 [DEBUG] XGB iter 0: tr-p-rmse: 0.840748\ttr-a-peak@32: 0.996884\ttr-rmse: 0.785376\ttr-rmse: 0.785376\n", - "2024-09-05 14:33:28 [DEBUG] XGB iter 25: tr-p-rmse: 0.805981\ttr-a-peak@32: 0.843750\ttr-rmse: 0.784370\ttr-rmse: 0.784370\n", - "2024-09-05 14:33:29 [DEBUG] XGB iter 50: tr-p-rmse: 0.805981\ttr-a-peak@32: 0.843750\ttr-rmse: 0.784370\ttr-rmse: 0.784370\n", - "2024-09-05 14:33:29 [DEBUG] XGB stopped. Best iteration: [22] tr-p-rmse:0.80596\ttr-a-peak@32:0.84375\ttr-rmse:0.78438\ttr-rmse:0.78438 \n", - "2024-09-05 14:33:29 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215256
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552364
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 256 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2364 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:33:29 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 256 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2364 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2688\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2688\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:33:29 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:34:26 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:34:40 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:34:43 [DEBUG] XGB validation: p-rmse: 1.000245\ta-peak@32: 1.000000\n", - "2024-09-05 14:34:44 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215256
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552428
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 256 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2428 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:34:44 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 256 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2428 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2752\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2752\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:34:44 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:35:33 [INFO] [task_scheduler.cc:193] Sending 63 sample(s) to builder\n", - "2024-09-05 14:35:47 [INFO] [task_scheduler.cc:195] Sending 63 sample(s) to runner\n", - "2024-09-05 14:35:50 [DEBUG] XGB validation: p-rmse: 0.993923\ta-peak@32: 1.000000\n", - "2024-09-05 14:35:50 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215256
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552491
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 256 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2491 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:35:50 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 256 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2491 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2815\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2815\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:35:50 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:36:46 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:37:03 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:37:06 [DEBUG] XGB validation: p-rmse: 0.994393\ta-peak@32: 1.000000\n", - "2024-09-05 14:37:06 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215256
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552555
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 256 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2555 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:37:06 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 256 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2555 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2879\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 2879\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:37:06 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:38:01 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:38:15 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:38:18 [DEBUG] XGB validation: p-rmse: 0.998297\ta-peak@32: 1.000000\n", - "2024-09-05 14:38:18 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215256
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552619
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 256 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2619 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 2943\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:38:18 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 256 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2619 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 2943\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:38:18 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:39:13 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:39:23 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:39:26 [DEBUG] XGB validation: p-rmse: 0.995672\ta-peak@32: 1.000000\n", - "2024-09-05 14:39:26 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215256
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552683
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 256 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2683 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:39:26 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 256 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2683 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3007\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3007\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:39:26 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:40:21 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:40:41 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:40:44 [DEBUG] XGB validation: p-rmse: 0.989053\ta-peak@32: 1.000000\n", - "2024-09-05 14:40:44 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215256
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552747
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 256 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2747 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:40:44 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 256 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2747 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3071\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3071\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:40:44 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:41:36 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:41:49 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:41:51 [DEBUG] XGB validation: p-rmse: 0.994911\ta-peak@32: 1.000000\n", - "2024-09-05 14:41:52 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215256
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552811
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 256 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2811 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:41:52 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 256 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2811 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3135\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3135\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:41:52 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #1: \"transpose\"\n", - "2024-09-05 14:41:54 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder\n", - "2024-09-05 14:41:54 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner\n", - "2024-09-05 14:41:54 [INFO] [task_scheduler.cc:237] [Updated] Task #1: \"transpose\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215256
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552811
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 256 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2811 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 3135\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:41:54 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 256 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2811 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3135\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:41:54 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:42:46 [INFO] [task_scheduler.cc:193] Sending 63 sample(s) to builder\n", - "2024-09-05 14:43:09 [INFO] [task_scheduler.cc:195] Sending 63 sample(s) to runner\n", - "2024-09-05 14:43:12 [DEBUG] XGB validation: p-rmse: 0.997728\ta-peak@32: 1.000000\n", - "2024-09-05 14:43:12 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215256
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552874
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 256 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2874 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:43:12 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 256 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2874 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3198\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3198\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:43:12 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:44:09 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:44:30 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:44:33 [DEBUG] XGB validation: p-rmse: 0.988110\ta-peak@32: 1.000000\n", - "2024-09-05 14:44:33 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215256
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12552938
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 256 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 2938 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:44:33 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 256 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 2938 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3262\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3262\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:44:33 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:45:27 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:45:41 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:45:44 [DEBUG] XGB validation: p-rmse: 0.989306\ta-peak@32: 1.000000\n", - "2024-09-05 14:45:44 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215256
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553002
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 256 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 3002 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:45:44 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 256 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3002 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3326\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3326\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:45:44 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:46:37 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:46:51 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:46:54 [DEBUG] XGB validation: p-rmse: 0.990028\ta-peak@32: 1.000000\n", - "2024-09-05 14:46:54 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215256
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553066
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 256 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 3066 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 3390\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:46:54 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 256 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3066 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3390\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:46:54 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: \"fused_matmul_add13\"\n", - "2024-09-05 14:47:06 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:47:15 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:47:18 [DEBUG] XGB validation: p-rmse: 0.863139\ta-peak@32: 1.000000\n", - "2024-09-05 14:47:18 [DEBUG] XGB iter 0: tr-p-rmse: 0.868177\ttr-a-peak@32: 0.996884\ttr-rmse: 0.785376\ttr-rmse: 0.785376\n", - "2024-09-05 14:47:18 [DEBUG] XGB iter 25: tr-p-rmse: 0.845161\ttr-a-peak@32: 0.250000\ttr-rmse: 0.784373\ttr-rmse: 0.784373\n", - "2024-09-05 14:47:18 [DEBUG] XGB iter 50: tr-p-rmse: 0.845161\ttr-a-peak@32: 0.250000\ttr-rmse: 0.784373\ttr-rmse: 0.784373\n", - "2024-09-05 14:47:18 [DEBUG] XGB stopped. Best iteration: [22] tr-p-rmse:0.84515\ttr-a-peak@32:0.25000\ttr-rmse:0.78438\ttr-rmse:0.78438 \n", - "2024-09-05 14:47:18 [INFO] [task_scheduler.cc:237] [Updated] Task #0: \"fused_matmul_add13\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215320
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553066
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 320 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 3066 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:47:18 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 320 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3066 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3454\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3454\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:47:18 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:48:13 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:48:27 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:48:29 [DEBUG] XGB validation: p-rmse: 0.990602\ta-peak@32: 1.000000\n", - "2024-09-05 14:48:30 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215320
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553130
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 320 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 3130 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:48:30 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 320 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3130 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3518\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3518\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:48:30 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:49:21 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:49:36 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:49:39 [DEBUG] XGB validation: p-rmse: 0.989539\ta-peak@32: 1.000000\n", - "2024-09-05 14:49:39 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215320
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553194
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 320 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 3194 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:49:39 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 320 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3194 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3582\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3582\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:49:39 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:50:32 [INFO] [task_scheduler.cc:193] Sending 63 sample(s) to builder\n", - "2024-09-05 14:50:50 [INFO] [task_scheduler.cc:195] Sending 63 sample(s) to runner\n", - "2024-09-05 14:50:53 [DEBUG] XGB validation: p-rmse: 0.997316\ta-peak@32: 1.000000\n", - "2024-09-05 14:50:53 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215320
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.209962
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553257
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 320 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 62 \n", - "4 112.1255 3257 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:50:53 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 320 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 62 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3257 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3645\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3645\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:50:53 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #3: \"adaptive_avg_pool2d\"\n", - "2024-09-05 14:50:57 [INFO] [task_scheduler.cc:193] Sending 62 sample(s) to builder\n", - "2024-09-05 14:51:04 [INFO] [task_scheduler.cc:195] Sending 62 sample(s) to runner\n", - "2024-09-05 14:51:06 [DEBUG] XGB validation: p-rmse: 0.662898\ta-peak@32: 1.000000\n", - "2024-09-05 14:51:06 [INFO] [task_scheduler.cc:237] [Updated] Task #3: \"adaptive_avg_pool2d\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215320
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.2099124
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553257
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 320 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 124 \n", - "4 112.1255 3257 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:51:06 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 320 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 124 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3257 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3707\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3707\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:51:06 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:51:59 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:52:15 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:52:17 [DEBUG] XGB validation: p-rmse: 0.991711\ta-peak@32: 1.000000\n", - "2024-09-05 14:52:18 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215320
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.2099124
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553321
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 320 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 124 \n", - "4 112.1255 3321 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:52:18 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 320 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 124 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3321 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3771\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3771\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:52:18 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:53:10 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:53:40 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:53:43 [DEBUG] XGB validation: p-rmse: 0.992641\ta-peak@32: 1.000000\n", - "2024-09-05 14:53:44 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215320
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.2099124
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553385
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 320 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 124 \n", - "4 112.1255 3385 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:53:44 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 320 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 124 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3385 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3835\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3835\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:53:44 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:54:39 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:54:50 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:54:54 [DEBUG] XGB validation: p-rmse: 0.998569\ta-peak@32: 1.000000\n", - "2024-09-05 14:54:55 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215320
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.2099124
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553449
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 320 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 124 \n", - "4 112.1255 3449 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Total trials: 3899\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:54:55 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 320 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 124 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3449 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3899\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:54:55 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:55:49 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:56:24 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:56:27 [DEBUG] XGB validation: p-rmse: 0.992133\ta-peak@32: 1.000000\n", - "2024-09-05 14:56:27 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215320
1transpose110.000110.308810.30881
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.2099124
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553513
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 320 \n", - "1 10.3088 1 \n", - "2 3.2698 5 \n", - "3 2.2099 124 \n", - "4 112.1255 3513 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:56:27 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 320 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 124 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3513 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3963\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3963\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:56:27 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #1: \"transpose\"\n", - "2024-09-05 14:56:29 [INFO] [task_scheduler.cc:260] Task #1 has finished. Remaining task(s): 19\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215320
1transpose110.000110.308810.30881Y
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.2099124
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553513
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 320 \n", - "1 10.3088 1 Y \n", - "2 3.2698 5 \n", - "3 2.2099 124 \n", - "4 112.1255 3513 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:56:29 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 320 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | Y \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 124 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3513 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 3963\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 3963\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:56:29 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:57:25 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:57:38 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:57:42 [DEBUG] XGB validation: p-rmse: 0.994033\ta-peak@32: 1.000000\n", - "2024-09-05 14:57:43 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215320
1transpose110.000110.308810.30881Y
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.2099124
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553577
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 320 \n", - "1 10.3088 1 Y \n", - "2 3.2698 5 \n", - "3 2.2099 124 \n", - "4 112.1255 3577 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:57:43 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 320 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | Y \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 124 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3577 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 4027\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 4027\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:57:43 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:58:35 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 14:58:59 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 14:59:01 [DEBUG] XGB validation: p-rmse: 0.989477\ta-peak@32: 1.000000\n", - "2024-09-05 14:59:02 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215320
1transpose110.000110.308810.30881Y
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.2099124
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553641
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 320 \n", - "1 10.3088 1 Y \n", - "2 3.2698 5 \n", - "3 2.2099 124 \n", - "4 112.1255 3641 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 14:59:02 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 320 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | Y \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 124 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3641 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 4091\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 4091\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 14:59:02 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n", - "2024-09-05 14:59:55 [INFO] [task_scheduler.cc:193] Sending 64 sample(s) to builder\n", - "2024-09-05 15:00:07 [INFO] [task_scheduler.cc:195] Sending 64 sample(s) to runner\n", - "2024-09-05 15:00:10 [DEBUG] XGB validation: p-rmse: 0.992431\ta-peak@32: 1.000000\n", - "2024-09-05 15:00:10 [INFO] [task_scheduler.cc:237] [Updated] Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameFLOPWeightSpeed (GFLOPS)Latency (us)Weighted Latency (us)TrialsDone
0fused_matmul_add1310250001108.79369.42159.4215320
1transpose110.000110.308810.30881Y
2reshape110.00033.26983.26985
3adaptive_avg_pool2d25600111.58402.20992.2099124
4fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu423133644812063.1924112.1255112.12553705
5fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu2400419841N/AN/AN/A64
6fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu12322145282N/AN/AN/A64
7fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu41157309441N/AN/AN/A64
8fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu42313615362N/AN/AN/A64
9fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu22317127681N/AN/AN/A64
10fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11129454081N/AN/AN/A63
11max_pool2d18063361N/AN/AN/A62
12fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu32315120642N/AN/AN/A64
13fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5132464641N/AN/AN/A64
14fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu12324152322N/AN/AN/A64
15fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu21161072641N/AN/AN/A64
16fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu31158563841N/AN/AN/A63
17fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8130457601N/AN/AN/A64
18fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu22318131202N/AN/AN/A64
19fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu32314618881N/AN/AN/A64
\n", - "
" - ], - "text/plain": [ - " Name \\\n", - "0 fused_matmul_add13 \n", - "1 transpose \n", - "2 reshape \n", - "3 adaptive_avg_pool2d \n", - "4 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "5 fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu \n", - "6 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 \n", - "7 fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 \n", - "8 fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 \n", - "9 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "10 fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 \n", - "11 max_pool2d \n", - "12 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 \n", - "13 fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 \n", - "14 fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 \n", - "15 fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 \n", - "16 fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "17 fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 \n", - "18 fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 \n", - "19 fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 \n", - "\n", - " FLOP Weight Speed (GFLOPS) Latency (us) \\\n", - "0 1025000 1 108.7936 9.4215 \n", - "1 1 1 0.0001 10.3088 \n", - "2 1 1 0.0003 3.2698 \n", - "3 25600 1 11.5840 2.2099 \n", - "4 231336448 1 2063.1924 112.1255 \n", - "5 240041984 1 N/A N/A \n", - "6 232214528 2 N/A N/A \n", - "7 115730944 1 N/A N/A \n", - "8 231361536 2 N/A N/A \n", - "9 231712768 1 N/A N/A \n", - "10 12945408 1 N/A N/A \n", - "11 1806336 1 N/A N/A \n", - "12 231512064 2 N/A N/A \n", - "13 13246464 1 N/A N/A \n", - "14 232415232 2 N/A N/A \n", - "15 116107264 1 N/A N/A \n", - "16 115856384 1 N/A N/A \n", - "17 13045760 1 N/A N/A \n", - "18 231813120 2 N/A N/A \n", - "19 231461888 1 N/A N/A \n", - "\n", - " Weighted Latency (us) Trials Done \n", - "0 9.4215 320 \n", - "1 10.3088 1 Y \n", - "2 3.2698 5 \n", - "3 2.2099 124 \n", - "4 112.1255 3705 \n", - "5 N/A 64 \n", - "6 N/A 64 \n", - "7 N/A 64 \n", - "8 N/A 64 \n", - "9 N/A 64 \n", - "10 N/A 63 \n", - "11 N/A 62 \n", - "12 N/A 64 \n", - "13 N/A 64 \n", - "14 N/A 64 \n", - "15 N/A 64 \n", - "16 N/A 63 \n", - "17 N/A 64 \n", - "18 N/A 64 \n", - "19 N/A 64 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-09-05 15:00:10 [DEBUG] [task_scheduler.cc:318] \n", - " ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - " 0 | fused_matmul_add13 | 1025000 | 1 | 108.7936 | 9.4215 | 9.4215 | 320 | \n", - " 1 | transpose | 1 | 1 | 0.0001 | 10.3088 | 10.3088 | 1 | Y \n", - " 2 | reshape | 1 | 1 | 0.0003 | 3.2698 | 3.2698 | 5 | \n", - " 3 | adaptive_avg_pool2d | 25600 | 1 | 11.5840 | 2.2099 | 2.2099 | 124 | \n", - " 4 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 231336448 | 1 | 2063.1924 | 112.1255 | 112.1255 | 3705 | \n", - " 5 | fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu | 240041984 | 1 | N/A | N/A | N/A | 64 | \n", - " 6 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1 | 232214528 | 2 | N/A | N/A | N/A | 64 | \n", - " 7 | fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4 | 115730944 | 1 | N/A | N/A | N/A | 64 | \n", - " 8 | fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4 | 231361536 | 2 | N/A | N/A | N/A | 64 | \n", - " 9 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 231712768 | 1 | N/A | N/A | N/A | 64 | \n", - " 10 | fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11 | 12945408 | 1 | N/A | N/A | N/A | 63 | \n", - " 11 | max_pool2d | 1806336 | 1 | N/A | N/A | N/A | 62 | \n", - " 12 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_add9_relu3 | 231512064 | 2 | N/A | N/A | N/A | 64 | \n", - " 13 | fused_conv2d4_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5 | 13246464 | 1 | N/A | N/A | N/A | 64 | \n", - " 14 | fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1 | 232415232 | 2 | N/A | N/A | N/A | 64 | \n", - " 15 | fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2 | 116107264 | 1 | N/A | N/A | N/A | 64 | \n", - " 16 | fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 115856384 | 1 | N/A | N/A | N/A | 63 | \n", - " 17 | fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8 | 13045760 | 1 | N/A | N/A | N/A | 64 | \n", - " 18 | fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2 | 231813120 | 2 | N/A | N/A | N/A | 64 | \n", - " 19 | fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3 | 231461888 | 1 | N/A | N/A | N/A | 64 | \n", - "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", - "Total trials: 4155\n", - "Total latency (us): 137.336\n", - "\n", - "\n", - "Total trials: 4155\n", - "Total latency (us): 137.336\n", - "\n", - "2024-09-05 15:00:10 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #4: \"fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4\"\n" - ] - } - ], + "execution_count": null, + "metadata": { + "tags": [ + "hidden-output" + ] + }, + "outputs": [], "source": [ "TOTAL_TRIALS = 8000 # Change to 20000 for better performance if needed\n", "target = tvm.target.Target(\"nvidia/geforce-rtx-3090-ti\") # Change to your target device\n", diff --git a/src/tvm_book/__init__.py b/src/tvm_book/__init__.py index a90f69f6..4a411aa3 100644 --- a/src/tvm_book/__init__.py +++ b/src/tvm_book/__init__.py @@ -1 +1,3 @@ """TVM: Open Deep Learning Compiler Stack.""" + +__version__ = '0.1'