From 89cee339d10fdc091c6109688c94e96c84803f67 Mon Sep 17 00:00:00 2001 From: Geoffrey Blake Date: Wed, 21 Mar 2018 15:41:56 -0500 Subject: [PATCH 1/4] code/intel: Modify code to work better with automated testing environments Update Intel code to work better in automated testing environments by allowing one to specify on the command line the type of matrix to test and additionally print output in a machine readable CSV format. We also provide some basic bug fixes to the code as well. Signed-off-by: Geoffrey Blake (Geoffrey.Blake@arm.com) --- code/intel/convolution/mkl_conv/Makefile | 8 +- .../convolution/mkl_conv/std_conv_bench.cpp | 272 ++++++++++++++---- code/intel/gemm/Makefile | 6 +- code/intel/gemm/bench.cpp | 139 +++++++-- code/intel/gemm/run_mkl_igemm_ia.sh | 2 +- code/intel/gemm/run_mkl_sgemm_ia.sh | 4 +- code/intel/spmm/Makefile | 4 +- 7 files changed, 343 insertions(+), 92 deletions(-) diff --git a/code/intel/convolution/mkl_conv/Makefile b/code/intel/convolution/mkl_conv/Makefile index f6117be..6f9f0e2 100644 --- a/code/intel/convolution/mkl_conv/Makefile +++ b/code/intel/convolution/mkl_conv/Makefile @@ -30,10 +30,10 @@ ifeq ($(MKLLIB), mklml_intel) EXTRALIB = -L$(MKLROOT)/lib -lmklml_intel endif ifeq ($(MKLLIB), mkl_rt) - EXTRALIB = -L$(MKLROOT)/lib/intel64/ -lmkl_rt \ + EXTRALIB = -L$(MKLROOT)/lib -lmkl_rt \ -Wl,-rpath,$(MKLROOT)/lib/intel64 endif -EXTRALIB += -liomp5 -lpthread -lm -ldl +EXTRALIB += -L$(MKLROOT)/lib -liomp5 -lpthread -lm -ldl endif ifeq ($(CONVLIB),MKLDNN) @@ -43,7 +43,7 @@ ifeq ($(MKLDNNROOT),) to the install directory.) endif EXTRACXXFLAGS = -I$(MKLDNNROOT)/include -DUSE_MKLDNN -EXTRALIB = -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,$(MKLDNNROOT)/lib +EXTRALIB = -L$(MKLDNNROOT)/lib -lmkldnn -lmklml_intel -Wl,-rpath,$(MKLDNNROOT)/lib endif ifeq ($(DEBUG), 1) @@ -53,7 +53,7 @@ OPTFLAGS = -O3 endif CXX = icpc -CXXFLAGS = -Wall -std=c++11 $(OPTFLAGS) $(EXTRACXXFLAGS) -fopenmp -I../../../kernels +CXXFLAGS = -Wall -std=c++11 $(OPTFLAGS) $(EXTRACXXFLAGS) -fopenmp -I../../../kernelss LFLAGS = -lrt OBJS = std_conv_bench.o diff --git a/code/intel/convolution/mkl_conv/std_conv_bench.cpp b/code/intel/convolution/mkl_conv/std_conv_bench.cpp index 3860892..fd2229a 100644 --- a/code/intel/convolution/mkl_conv/std_conv_bench.cpp +++ b/code/intel/convolution/mkl_conv/std_conv_bench.cpp @@ -18,11 +18,13 @@ #include #include #include +#include #include #include #include #include +#include struct conv_problem { int minibatch; @@ -51,8 +53,6 @@ struct conv_problem { #define INFERENCE_SERVER 1 #define INFERENCE_DEVICE 2 -#define ITERS 1000 - // Calculates convolution output dimension using the definition from Caffe static inline int calc_out_dim( int input_dim, int filter_dim, int padd, int stride) @@ -352,10 +352,6 @@ static void usage() printf( "Usage: [OPTIONS]\n" "\n" - "Output control:\n" - " --csv-output Produce CSV output\n" - " --original-output Produce output in the original format\n" - "\n" "Control flops calculations:\n" " --no-skip-padding Count ops with padding zeroes (default)\n" " --skip-padding Do not count ops with padding zeroes\n" @@ -367,8 +363,21 @@ static void usage() " (AVX512_4VNNI CPUs)\n" "Problem set control:\n" " --training Training data set (default)\n" - " --inference-server Server inference data set\n" - " --inference-device Device inference data set\n" + " --inference Server inference data set\n" + " --device Device inference data set\n" + "Custom convolution definition:\n" + " --w Width\n" + " --h Height\n" + " --c \n" + " --n \n", + " --k \n", + " --filter_w \n", + " --filter_h \n", + " --pad_w \n", + " --pad_h \n", + " --wstride \n", + " --hstride \n", + " --repeat Number of times to test convolution (default: 50)\n", "\n" ); exit(-1); @@ -377,36 +386,185 @@ static void usage() int main(int argc, char **argv) { bool skip_padding = false; - bool csv_output = false; int precision = PREC_F32; - std::vector modes - = {FWD_CONVOLUTION, BWD_F_CONVOLUTION, BWD_D_CONVOLUTION}; + std::vector modes = {FWD_CONVOLUTION}; int problem_set = TRAINING; - - for(argc--, argv++; argc; argv++, argc--) { - if (*argv == std::string("--csv-output")) - csv_output = true; - else if (*argv == std::string("--original-output")) - csv_output = false; - else if (*argv == std::string("--skip-padding")) - skip_padding = true; - else if (*argv == std::string("--no-skip-padding")) - skip_padding = false; - else if (*argv == std::string("--f32")) - precision = PREC_F32; - else if (*argv == std::string("--u8s8u8")) - precision = PREC_U8S8U8; - else if (*argv == std::string("--s16s16s32")) - precision = PREC_S16S16S32; - else if (*argv == std::string("--inference-device")) - problem_set = INFERENCE_DEVICE; - else if (*argv == std::string("--inference-server")) - problem_set = INFERENCE_SERVER; - else if (*argv == std::string("--training")) - problem_set = TRAINING; - else - usage(); - } + // DEFAULTS + int ITERS = 50; + std::vector > *problems = nullptr; + unsigned int w, h, c, n, k, filter_w, filter_h, pad_w, pad_h, wstride, hstride; + w = 151; h = 40; c = 1; n = 1; k = 32; filter_w = 20; + filter_h = 5; pad_w = 8; pad_h = 8; wstride = 8; hstride = 2; + + // Use getopt_long here to allow for either driving the benchmark using + // built in tests, or make it a gemm tester + static struct option long_options[] = { + {"training", no_argument, 0, 0}, // These will run the full tests and override customization + {"inference", no_argument, 0, 0}, + {"device", no_argument, 0, 0}, + {"repeat", required_argument, 0, 0}, + {"w", required_argument, 0, 0}, + {"h", required_argument, 0, 0}, + {"c", required_argument, 0, 0}, + {"n", required_argument, 0, 0}, + {"k", required_argument, 0, 0}, + {"filter_w", required_argument, 0, 0}, + {"filter_h", required_argument, 0, 0}, + {"pad_w", required_argument, 0, 0}, + {"pad_h", required_argument, 0, 0}, + {"wstride", required_argument, 0, 0}, + {"hstride", required_argument, 0, 0}, + {"no-skip-padding", no_argument, 0, 0}, + {"skip-padding", no_argument, 0, 0}, + {"f32", no_argument, 0, 0}, + {"u8s8u8", no_argument, 0, 0}, + {"s16s16s32", no_argument, 0, 0}, + {0, 0, 0, 0} + }; + + int opt; + do { + int option_index = 0; + opt = getopt_long(argc, argv, "", long_options, &option_index); + switch (opt) { + case -1: + break; + case 0: + switch (option_index) { + case 0: + if (problems == nullptr) { + problems = &training_set; + modes = {FWD_CONVOLUTION, BWD_F_CONVOLUTION, BWD_D_CONVOLUTION}; + std::cout << "Running the training benchmark set" << std::endl; + } + break; + case 1: + if (problems == nullptr) { + problems = &inference_server_set; + std::cout << "Running the inference server set" << std::endl; + } + break; + case 2: + if (problems == nullptr) { + problems = &inference_device_set; + std::cout << "Running the inference device set" << std::endl; + } + break; + case 3: + ITERS = std::atoi(optarg); + if (ITERS <= 0) { + std::cerr << "Invalid repeat parameter spec'ed" << std::endl; + return 0; + } + break; + case 4: + w = std::atoi(optarg); + if (w <= 0) { + std::cerr << "Invalid w parameter spec'ed" << std::endl; + return 0; + } + break; + case 5: + h = std::atoi(optarg); + if (h <= 0) { + std::cerr << "Invalid h parameter spec'ed" << std::endl; + return 0; + } + break; + case 6: + c = std::atoi(optarg); + if (c <= 0) { + std::cerr << "Invalid c parameter spec'ed" << std::endl; + return 0; + } + break; + case 7: + n = std::atoi(optarg); + if (n <= 0) { + std::cerr << "Invalid n parameter spec'ed" << std::endl; + return 0; + } + break; + case 8: + k = std::atoi(optarg); + if (k <= 0) { + std::cerr << "Invalid k parameter spec'ed" << std::endl; + return 0; + } + break; + case 9: + filter_w = std::atoi(optarg); + if (filter_w <= 0) { + std::cerr << "Invalid filter_w paramter spec'ed" << std::endl; + return 0; + } + break; + case 10: + filter_h = std::atoi(optarg); + if (filter_h <= 0) { + std::cerr << "Invalid filter_h parameter spec'ed" << std::endl; + return 0; + } + break; + case 11: + pad_w = std::atoi(optarg); + if (pad_w < 0) { + std::cerr << "Invalid pad_w parameter spec'ed" << std::endl; + return 0; + } + break; + case 12: + pad_h = std::atoi(optarg); + if (pad_h < 0) { + std::cerr << "Invalid pad_h parameter spec'ed" << std::endl; + return 0; + } + break; + case 13: + wstride = std::atoi(optarg); + if (wstride <= 0) { + std::cerr << "Invalid wstride parameter spec'ed" << std::endl; + return 0; + } + break; + case 14: + hstride = std::atoi(optarg); + if (hstride <= 0) { + std::cerr << "Invalid hstride parameter spec'ed" << std::endl; + return 0; + } + break; + case 15: + skip_padding = false; + break; + case 16: + skip_padding = true; + break; + case 17: + precision = PREC_F32; + break; + case 18: + precision = PREC_U8S8U8; + break; + case 19: + precision = PREC_S16S16S32; + break; + default: + break; + } + break; + case '?': + usage(); + return 0; + break; + default: + usage(); + return 0; + break; + } + } while (opt != -1); #ifdef USE_MKL if (precision != PREC_F32) { @@ -416,43 +574,31 @@ int main(int argc, char **argv) } #endif -#ifdef USE_MKLDNN - if (precision != PREC_F32 || problem_set != TRAINING) - modes = {FWD_CONVOLUTION}; -#endif + if (problems == nullptr) { + problems = new std::vector >(); + problems->push_back(std::tuple(w, h, c, n, k, filter_w, + filter_h, pad_w, pad_h, wstride, hstride)); + } const char *conv_mode_strs[] = {"FWD", "BWD_F", "BWD_D"}; const char *skip_padding_strs[] - = {"w/ padding in flops", "w/o padding in flops"}; - - const auto &problems = (problem_set == TRAINING - ? training_set - : (problem_set == INFERENCE_DEVICE - ? inference_device_set - : inference_server_set)); + = {"w/ padding in flops", "w/o padding in flops"}; + printf("OP,w,h,c,n,k,filter_w,filter_h,pad_w,pad_h,wstride,hstride,usecs,gops\n"); for (auto m : modes) { - if (!csv_output) - printf(" %s Convolution\n", conv_mode_strs[m]); - for (const auto& problem : problems) { + for (const auto& problem : *problems) { conv_problem p; std::tie(p.w, p.h, p.ic, p.minibatch, p.oc, p.fw, p.fh, p.pad_w, p.pad_h, p.stride_w, p.stride_h) = problem; p.iters = ITERS; auto r = bench_conv(p, m, precision, skip_padding); - if (csv_output) - printf("%s,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%e,%e,%e,%e\n", - conv_mode_strs[m], skip_padding, - p.minibatch, p.w, p.h, p.ic, p.oc, p.fw, p.fh, - p.stride_w, p.stride_h, p.pad_w, p.pad_h, - r.min_ms, r.max_gflops, r.avg_ms, r.avg_gflops); - else - printf("W=%d, H=%d, C=%d, N=%d, K=%d, S=%d, R=%d | " - "%s %s min(ms) %.2f; max(gflop/s) %.2f;" - "avg(ms) %.2f; avg(gflop/s) %.2f;\n", - p.w, p.h, p.ic, p.minibatch, p.oc, p.fw, p.fh, - conv_mode_strs[m], skip_padding_strs[skip_padding], - r.min_ms, r.max_gflops, r.avg_ms, r.avg_gflops); + printf("%s,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%f,%f\n", + conv_mode_strs[m], p.w, p.h, p.ic, p.minibatch, p.oc, + p.fw, p.fh,p.pad_w,p.pad_h,p.stride_h,p.stride_w,r.avg_ms*1000.0, r.avg_gflops); fflush(0); } } diff --git a/code/intel/gemm/Makefile b/code/intel/gemm/Makefile index 36e0278..895b467 100644 --- a/code/intel/gemm/Makefile +++ b/code/intel/gemm/Makefile @@ -15,11 +15,11 @@ # ****************************************************************************** CC = icc -CFLAGS = -O2 -Wall -I$(MKLROOT)/include -I../../kernels -qopenmp -std=c++11 +CFLAGS = -O2 -Wall -I$(MKLROOT)/include -I../../kernels -fopenmp -std=c++11 -EXTRALIB = -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_intel_thread.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -liomp5 -lpthread -lm -ldl +EXTRALIB = -Wl,--start-group $(MKLROOT)/lib/libmkl_intel_lp64.a $(MKLROOT)/lib/libmkl_intel_thread.a $(MKLROOT)/lib/libmkl_core.a -Wl,--end-group -L$(MKLROOT)/lib/ -liomp5 -lpthread -lm -ldl -all : sbench sbench_pack +all : sbench sbench_pack ibench_s8u8s32 ibench_s8u8s32 : ibench_s8u8s32.o $(CC) $(CFLAGS) $^ $(EXTRALIB) -o $@ diff --git a/code/intel/gemm/bench.cpp b/code/intel/gemm/bench.cpp index bc92d39..0cc36ef 100644 --- a/code/intel/gemm/bench.cpp +++ b/code/intel/gemm/bench.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -34,7 +35,6 @@ #define FIX_LD(x) (((((x) + 127)/128)*128) + 16) -#define REPEAT 10 #define MKL_MEM_ALIGNMENT (4*1024) #ifdef IGEMM_S8U8S32 @@ -63,7 +63,18 @@ typedef struct gemm_params { int ldc; } gemm_params_t; - +void print_usage() +{ + std::cout << " " << std::endl; + std::cout << std::left << std::setw(30) << "\tARGS" << std::endl; + std::cout << std::left << std::setw(30) << "\t--training|inference|device" << "\tSelect and run the built in input set" << std::endl; + std::cout << std::left << std::setw(30) << "\t--m" << "\tNum rows matrix A" << std::endl; + std::cout << std::left << std::setw(30) << "\t--n" << "\tNum cols matrix B" << std::endl; + std::cout << std::left << std::setw(30) << "\t--k" << "\tNum cols matrix A, rows Matrix B" << std::endl; + std::cout << std::left << std::setw(30) << "\t--ta" << "\tTranspose A" << std::endl; + std::cout << std::left << std::setw(30) << "\t--tb" << "\tTranspose B" << std::endl; + return; +} int main(int argc, char *argv[]) { @@ -80,21 +91,113 @@ int main(int argc, char *argv[]) #ifdef PACKED_API float *AP, *BP; #endif + // DEFAULT settings + int REPEAT = 10; + // Default matrix test size if we are doing a single test + int m, n, k; + m = 128; n = 128; k = 128; + bool ta, tb; + ta = false; tb = false; + std::vector>* p_problem_set = nullptr; - int run_training_set = 1; - if (argc > 1) run_training_set = atoi(argv[1]); + // Use getopt_long here to allow for either driving the benchmark using + // built in tests, or make it a gemm tester + static struct option long_options[] = { + {"training", no_argument, 0, 0}, // These will run the full tests and override customization + {"inference", no_argument, 0, 0}, + {"device", no_argument, 0, 0}, + {"repeat", required_argument, 0, 0}, + {"m", required_argument, 0, 0}, + {"n", required_argument, 0, 0}, + {"k", required_argument, 0, 0}, + {"ta", no_argument, 0, 0}, + {"tb", no_argument, 0, 0}, + {0, 0, 0, 0} + }; - std::vector>* p_problem_set; - if (run_training_set) { - printf("Running the training benchmark (set first program argument to 0 for inference)\n"); - p_problem_set = &training_set; - } else { - printf("Running the inference benchmark (first program argument is 0)\n"); - p_problem_set = &inference_server_set; + int c; + do { + int option_index = 0; + c = getopt_long(argc, argv, "", long_options, &option_index); + switch (c) { + case -1: + break; + case 0: + switch (option_index) { + case 0: + if (p_problem_set == nullptr) { + p_problem_set = &training_set; + std::cout << "Running the training benchmark set" << std::endl; + } + break; + case 1: + if (p_problem_set == nullptr) { + p_problem_set = &inference_server_set; + std::cout << "Running the inference server set" << std::endl; + } + break; + case 2: + if (p_problem_set == nullptr) { + p_problem_set = &inference_device_set; + std::cout << "Running the inference device set" << std::endl; + } + break; + case 3: + REPEAT = std::atoi(optarg); + if (REPEAT <= 0) { + std::cerr << "Invalid repeat parameter spec'ed" << std::endl; + return 0; + } + break; + case 4: + m = std::atoi(optarg); + if (m <= 0) { + std::cerr << "Invalid m parameter spec'ed" << std::endl; + return 0; + } + break; + case 5: + n = std::atoi(optarg); + if (n <= 0) { + std::cerr << "Invalid n parameter spec'ed" << std::endl; + return 0; + } + break; + case 6: + k = std::atoi(optarg); + if (k <= 0) { + std::cerr << "Invalid k parameter spec'ed" << std::endl; + return 0; + } + break; + case 7: + ta = true; + break; + case 8: + tb = true; + break; + default: + break; + } + break; + case '?': + print_usage(); + return 0; + break; + default: + print_usage(); + return 0; + break; + } + } while (c != -1); + + if (p_problem_set == nullptr) { + p_problem_set = new std::vector >(); + p_problem_set->push_back(std::tuple(m, n, k, ta, tb)); } num_gemms = p_problem_set->size(); - gemm_params_t* p_gemm_params = (gemm_params_t*) _mm_malloc(num_gemms*sizeof(gemm_params_t), 64); + gemm_params_t* p_gemm_params = (gemm_params_t*) mkl_malloc(num_gemms*sizeof(gemm_params_t), 64); i = 0; for (const auto &problem : *p_problem_set) { @@ -154,6 +257,9 @@ int main(int argc, char *argv[]) for (i=0; i Date: Tue, 19 Jun 2018 09:54:16 -0500 Subject: [PATCH 2/4] fixup makefiles to remove my funky setup --- code/intel/convolution/mkl_conv/Makefile | 4 ++-- code/intel/gemm/Makefile | 2 +- code/intel/spmm/Makefile | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/code/intel/convolution/mkl_conv/Makefile b/code/intel/convolution/mkl_conv/Makefile index 6f9f0e2..1138f9f 100644 --- a/code/intel/convolution/mkl_conv/Makefile +++ b/code/intel/convolution/mkl_conv/Makefile @@ -30,10 +30,10 @@ ifeq ($(MKLLIB), mklml_intel) EXTRALIB = -L$(MKLROOT)/lib -lmklml_intel endif ifeq ($(MKLLIB), mkl_rt) - EXTRALIB = -L$(MKLROOT)/lib -lmkl_rt \ + EXTRALIB = -L$(MKLROOT)/lib/intel64/ -lmkl_rt \ -Wl,-rpath,$(MKLROOT)/lib/intel64 endif -EXTRALIB += -L$(MKLROOT)/lib -liomp5 -lpthread -lm -ldl +EXTRALIB += -L$(MKLROOT)/lib/intel64/ -liomp5 -lpthread -lm -ldl endif ifeq ($(CONVLIB),MKLDNN) diff --git a/code/intel/gemm/Makefile b/code/intel/gemm/Makefile index 895b467..d972d06 100644 --- a/code/intel/gemm/Makefile +++ b/code/intel/gemm/Makefile @@ -17,7 +17,7 @@ CC = icc CFLAGS = -O2 -Wall -I$(MKLROOT)/include -I../../kernels -fopenmp -std=c++11 -EXTRALIB = -Wl,--start-group $(MKLROOT)/lib/libmkl_intel_lp64.a $(MKLROOT)/lib/libmkl_intel_thread.a $(MKLROOT)/lib/libmkl_core.a -Wl,--end-group -L$(MKLROOT)/lib/ -liomp5 -lpthread -lm -ldl +EXTRALIB = -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_intel_thread.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -L$(MKLROOT)/lib/intel64/ -liomp5 -lpthread -lm -ldl all : sbench sbench_pack ibench_s8u8s32 diff --git a/code/intel/spmm/Makefile b/code/intel/spmm/Makefile index e3f2ebf..fa9d5bd 100644 --- a/code/intel/spmm/Makefile +++ b/code/intel/spmm/Makefile @@ -19,7 +19,7 @@ ifeq ($(MKLROOT),) endif EXTRACXXFLAGS = -I$(MKLROOT)/include -EXTRALIB = -L$(MKLROOT)/lib/ -lmkl_rt \ +EXTRALIB = -L$(MKLROOT)/lib/intel64/ -lmkl_rt \ -Wl,-rpath,$(MKLROOT)/lib/intel64 -liomp5 -lpthread -lm -ldl ifeq ($(DEBUG), 1) @@ -28,7 +28,7 @@ else OPTFLAGS = -O3 -fopenmp endif -CXX = g++ +CXX = icpc CXXFLAGS = -Wall -std=c++11 $(OPTFLAGS) $(EXTRACXXFLAGS) -I../../kernels LFLAGS = -lrt OBJS = spmm_bench.o From e030e6480a53be83f0f2a89cf75edda7af2227b4 Mon Sep 17 00:00:00 2001 From: Geoffrey Blake Date: Mon, 23 Apr 2018 17:20:04 -0500 Subject: [PATCH 3/4] intel/gemm: Add in bindings to test OpenBLAS Signed-off-by: Geoffrey Blake (Geoffrey.Blake@arm.com) --- code/intel/gemm/Makefile | 17 ++++++++-- code/intel/gemm/bench.cpp | 69 ++++++++++++++++++++++++++++++++------- 2 files changed, 71 insertions(+), 15 deletions(-) diff --git a/code/intel/gemm/Makefile b/code/intel/gemm/Makefile index d972d06..b1df381 100644 --- a/code/intel/gemm/Makefile +++ b/code/intel/gemm/Makefile @@ -19,7 +19,12 @@ CFLAGS = -O2 -Wall -I$(MKLROOT)/include -I../../kernels -fopenmp -std=c++11 EXTRALIB = -Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_intel_thread.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -L$(MKLROOT)/lib/intel64/ -liomp5 -lpthread -lm -ldl -all : sbench sbench_pack ibench_s8u8s32 +OPENBLAS_PATH?=/usr/local/openblas + +OPENBLAS_FLAGS = -O2 -Wall -I$(OPENBLAS_PATH)/include -I../../kernels -fopenmp -std=c++11 -DUSE_OPENBLAS +OPENBLAS_LIBS = -L$(OPENBLAS_PATH)/lib -DUSE_OPENBLAS -lpthread -lm -ldl -lopenblas + +all : sbench sbench_pack ibench_s8u8s32 sbench_oblas ibench_s8u8s32 : ibench_s8u8s32.o $(CC) $(CFLAGS) $^ $(EXTRALIB) -o $@ @@ -33,11 +38,17 @@ sbench_pack : sbench_pack.o sbench_pack.o : bench.cpp ../../kernels/gemm_problems.h $(CC) $(CFLAGS) -DPACKED_API -c -o $@ $< +sbench.o: bench.cpp ../../kernels/gemm_problems.h + $(CC) $(CFLAGS) -c -o $@ $< + sbench : sbench.o $(CC) $(CFLAGS) $^ $(EXTRALIB) -o $@ -sbench.o : bench.cpp ../../kernels/gemm_problems.h - $(CC) $(CFLAGS) -c -o $@ $< +sbench_oblas.o : bench.cpp ../../kernels/gemm_problems.h + $(CC) $(OPENBLAS_FLAGS) -c -o $@ $< + +sbench_oblas: sbench_oblas.o + $(CC) $(OPENBLAS_FLAGS) $^ $(OPENBLAS_LIBS) -o $@ clean : rm -f *.o sbench sbench_pack ibench_s8u8s32 diff --git a/code/intel/gemm/bench.cpp b/code/intel/gemm/bench.cpp index 0cc36ef..51c791c 100644 --- a/code/intel/gemm/bench.cpp +++ b/code/intel/gemm/bench.cpp @@ -30,7 +30,12 @@ #include #include +#ifndef USE_OPENBLAS #include +#else +#include +#endif + #include "gemm_problems.h" #define FIX_LD(x) (((((x) + 127)/128)*128) + 16) @@ -53,8 +58,13 @@ typedef struct gemm_params { bool ta; bool tb; +#ifdef USE_OPENBLAS + CBLAS_TRANSPOSE transa; + CBLAS_TRANSPOSE transb; +#else char transa; char transb; +#endif int m; int n; int k; @@ -87,7 +97,7 @@ int main(int argc, char *argv[]) B_TYPE *B; C_TYPE *C, co = 0; float alpha = 1.0, beta = 1.0; - double flops, total_flops = 0., st_time, end_time, ave_time, total_time = 0.; + double flops, total_flops = 0., ave_time, total_time = 0.; #ifdef PACKED_API float *AP, *BP; #endif @@ -197,7 +207,7 @@ int main(int argc, char *argv[]) } num_gemms = p_problem_set->size(); - gemm_params_t* p_gemm_params = (gemm_params_t*) mkl_malloc(num_gemms*sizeof(gemm_params_t), 64); + gemm_params_t* p_gemm_params = (gemm_params_t*) malloc(num_gemms*sizeof(gemm_params_t)); i = 0; for (const auto &problem : *p_problem_set) { @@ -207,21 +217,37 @@ int main(int argc, char *argv[]) if (p_gemm_params[i].ta) { p_gemm_params[i].lda = FIX_LD(p_gemm_params[i].k); sizea = p_gemm_params[i].lda * p_gemm_params[i].m; +#ifdef USE_OPENBLAS + p_gemm_params[i].transa = CblasTrans; +#else p_gemm_params[i].transa = 'T'; +#endif } else { p_gemm_params[i].lda = FIX_LD(p_gemm_params[i].m); sizea = p_gemm_params[i].lda * p_gemm_params[i].k; +#ifdef USE_OPENBLAS + p_gemm_params[i].transa = CblasNoTrans; +#else p_gemm_params[i].transa = 'N'; +#endif } if (p_gemm_params[i].tb) { p_gemm_params[i].ldb = FIX_LD(p_gemm_params[i].n); sizeb = p_gemm_params[i].ldb * p_gemm_params[i].k; +#ifdef USE_OPENBLAS + p_gemm_params[i].transb = CblasTrans; +#else p_gemm_params[i].transb = 'T'; +#endif } else { p_gemm_params[i].ldb = FIX_LD(p_gemm_params[i].k); sizeb = p_gemm_params[i].ldb * p_gemm_params[i].n; +#ifdef USE_OPENBLAS + p_gemm_params[i].transb = CblasNoTrans; +#else p_gemm_params[i].transb = 'N'; +#endif } p_gemm_params[i].ldc = FIX_LD(p_gemm_params[i].m); @@ -239,12 +265,17 @@ int main(int argc, char *argv[]) assert(i == num_gemms); +#ifdef USE_OPENBLAS + A = (A_TYPE*) malloc(sizeof(A_TYPE)*max_sizea); + B = (B_TYPE*) malloc(sizeof(B_TYPE)*max_sizeb); + C = (C_TYPE*) malloc(sizeof(C_TYPE)*max_sizec); +#elif defined(PACKED_API) + AP = sgemm_alloc("A", &max_m, &max_n, &max_k); + BP = sgemm_alloc("B", &max_m, &max_n, &max_k); +#else A = (A_TYPE*) mkl_malloc(sizeof(A_TYPE)*max_sizea, MKL_MEM_ALIGNMENT); B = (B_TYPE*) mkl_malloc(sizeof(B_TYPE)*max_sizeb, MKL_MEM_ALIGNMENT); C = (C_TYPE*) mkl_malloc(sizeof(C_TYPE)*max_sizec, MKL_MEM_ALIGNMENT); -#ifdef PACKED_API - AP = sgemm_alloc("A", &max_m, &max_n, &max_k); - BP = sgemm_alloc("B", &max_m, &max_n, &max_k); #endif #ifdef IGEMM_S8U8S32 @@ -268,7 +299,8 @@ int main(int argc, char *argv[]) // warmup sgemm_compute("P", "P", &p_gemm_params[i].m, &p_gemm_params[i].n, &p_gemm_params[i].k, AP, &p_gemm_params[i].lda, BP, &p_gemm_params[i].ldb, &beta, C, &p_gemm_params[i].ldc); - st_time = dsecnd(); + + auto st_time = std::chrono::steady_clock::now(); for (j = 0; j < REPEAT; ++j) { sgemm_compute("P", "P", &p_gemm_params[i].m, &p_gemm_params[i].n, &p_gemm_params[i].k, AP, &p_gemm_params[i].lda, BP, &p_gemm_params[i].ldb, &beta, C, &p_gemm_params[i].ldc); @@ -278,28 +310,36 @@ int main(int argc, char *argv[]) #ifdef IGEMM_S8U8S32 gemm_s8u8s32(&p_gemm_params[i].transa, &p_gemm_params[i].transb, "F", &p_gemm_params[i].m, &p_gemm_params[i].n, &p_gemm_params[i].k, &alpha, A, &p_gemm_params[i].lda, &ao, B, &p_gemm_params[i].ldb, &bo, &beta, C, &p_gemm_params[i].ldc, &co); +#elif defined(USE_OPENBLAS) + cblas_sgemm(CblasColMajor, p_gemm_params[i].transa, p_gemm_params[i].transb, + p_gemm_params[i].m, p_gemm_params[i].n, p_gemm_params[i].k, alpha, + A, p_gemm_params[i].lda, B, p_gemm_params[i].ldb, beta, C, p_gemm_params[i].ldc); #else sgemm(&p_gemm_params[i].transa, &p_gemm_params[i].transb, &p_gemm_params[i].m, &p_gemm_params[i].n, &p_gemm_params[i].k, &alpha, A, &p_gemm_params[i].lda, B, &p_gemm_params[i].ldb, &beta, C, &p_gemm_params[i].ldc); #endif // time measurements - st_time = dsecnd(); + auto st_time = std::chrono::steady_clock::now(); for (j = 0; j < REPEAT; ++j) { #ifdef IGEMM_S8U8S32 gemm_s8u8s32(&p_gemm_params[i].transa, &p_gemm_params[i].transb, "F", &p_gemm_params[i].m, &p_gemm_params[i].n, &p_gemm_params[i].k, &alpha, A, &p_gemm_params[i].lda, &ao, B, &p_gemm_params[i].ldb, &bo, &beta, C, &p_gemm_params[i].ldc, &co); +#elif defined(USE_OPENBLAS) + cblas_sgemm(CblasColMajor, p_gemm_params[i].transa, p_gemm_params[i].transb, + p_gemm_params[i].m, p_gemm_params[i].n, p_gemm_params[i].k, alpha, + A, p_gemm_params[i].lda, B, p_gemm_params[i].ldb, beta, C, p_gemm_params[i].ldc); #else sgemm(&p_gemm_params[i].transa, &p_gemm_params[i].transb, &p_gemm_params[i].m, &p_gemm_params[i].n, &p_gemm_params[i].k, &alpha, A, &p_gemm_params[i].lda, B, &p_gemm_params[i].ldb, &beta, C, &p_gemm_params[i].ldc); #endif } #endif - end_time = dsecnd(); + auto end_time = std::chrono::steady_clock::now(); flops = 2.*p_gemm_params[i].m*p_gemm_params[i].n*p_gemm_params[i].k; total_flops += flops; - ave_time = 1E6*(end_time - st_time)/REPEAT; + ave_time = std::chrono::duration(end_time - st_time).count() /REPEAT; total_time += ave_time; #ifdef IGEMM_S8U8S32 @@ -315,12 +355,17 @@ int main(int argc, char *argv[]) #endif } +#ifdef USE_OPENBLAS + free(A); + free(B); + free(C); +#elif defined(PACKED_API) + sgemm_free(AP); + sgemm_free(BP); +#else mkl_free(A); mkl_free(B); mkl_free(C); -#ifdef PACKED_API - sgemm_free(AP); - sgemm_free(BP); #endif #ifdef IGEMM_S8U8S32 From a03af16cea1be09b2690cb6d4d954e5f8e02ea34 Mon Sep 17 00:00:00 2001 From: Geoffrey Blake Date: Wed, 20 Jun 2018 07:20:40 -0500 Subject: [PATCH 4/4] intel/convolution/mkl: Fixup typo introduced by previous commit Signed-off-by: Geoffrey Blake (Geoffrey.Blake@arm.com) --- code/intel/convolution/mkl_conv/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/intel/convolution/mkl_conv/Makefile b/code/intel/convolution/mkl_conv/Makefile index 1138f9f..07daa1e 100644 --- a/code/intel/convolution/mkl_conv/Makefile +++ b/code/intel/convolution/mkl_conv/Makefile @@ -53,7 +53,7 @@ OPTFLAGS = -O3 endif CXX = icpc -CXXFLAGS = -Wall -std=c++11 $(OPTFLAGS) $(EXTRACXXFLAGS) -fopenmp -I../../../kernelss +CXXFLAGS = -Wall -std=c++11 $(OPTFLAGS) $(EXTRACXXFLAGS) -fopenmp -I../../../kernels LFLAGS = -lrt OBJS = std_conv_bench.o