From 3479af17f3b1cdee1ea3a6f2b47a6500b3b8ae28 Mon Sep 17 00:00:00 2001 From: GokayGulsoy Date: Sun, 10 Dec 2023 20:45:45 +0300 Subject: [PATCH] measurement files added --- ECRv2/.vscode/c_cpp_properties.json | 4 +- ECRv2/README.md | 15 +- ECRv2/src/cudnn/Makefile | 304 ------------- ECRv2/src/cudnn/README.md | 5 - ECRv2/src/cudnn/cudnn_half.cu | 426 ----------------- ECRv2/src/cudnn/samples_common.mk | 34 -- ECRv2/src/cudnn/time.txt | 32 -- ECRv2/src/cudnn/time_fast/time_resnet.txt | 47 -- ECRv2/src/cudnn/time_fast/time_vgg.txt | 16 - .../src/cudnn/time_fft_tiling/time_resnet.txt | 47 -- ECRv2/src/cudnn/time_fft_tiling/time_vgg.txt | 16 - ECRv2/src/cudnn/time_gemm/time_resnet.txt | 47 -- ECRv2/src/cudnn/time_gemm/time_vgg.txt | 16 - ECRv2/src/cudnn/time_im_gemm/time_resnet.txt | 47 -- ECRv2/src/cudnn/time_im_gemm/time_vgg.txt | 16 - ECRv2/src/cudnn/time_resnet/batchsize32.txt | 47 -- ECRv2/src/cudnn/time_resnet/cudnn_half.cu | 427 ------------------ ECRv2/src/cudnn/time_sparsity/batchsize32.txt | 9 - ECRv2/src/cudnn/time_sparsity/cudnn_half.cu | 425 ----------------- .../cudnn/time_stride/stride1_batchsize32.txt | 16 - .../cudnn/time_stride/stride2_batchsize32.txt | 16 - .../cudnn/time_stride/stride3_batchsize32.txt | 16 - ECRv2/src/cudnn/time_vgg/batchsize32.txt | 16 - ECRv2/src/cudnn/time_vgg/cudnn_half.cu | 426 ----------------- ECRv2/times_vgg/singleECR_times.txt | 16 + dataset/resnet/kernel_name.txt | 2 +- dataset/vggdata/all_conv_name.txt | 2 +- dataset/vggdata/kernel_name.txt | 32 +- speedup/resnet/cuDNN_fast | 0 speedup/resnet/cuDNN_fft | 49 ++ speedup/resnet/cuDNN_gemm.txt | 49 ++ speedup/resnet/cuDNN_imp_gemm.tx | 49 ++ speedup/resnet/singleECR.txt | 49 ++ speedup/resnet/speedup.ipynb | 6 +- 34 files changed, 246 insertions(+), 2478 deletions(-) delete mode 100644 ECRv2/src/cudnn/Makefile delete mode 100644 ECRv2/src/cudnn/README.md delete mode 100644 ECRv2/src/cudnn/cudnn_half.cu delete mode 100644 ECRv2/src/cudnn/samples_common.mk delete mode 100644 ECRv2/src/cudnn/time.txt delete mode 100644 ECRv2/src/cudnn/time_fast/time_resnet.txt delete mode 100644 ECRv2/src/cudnn/time_fast/time_vgg.txt delete mode 100644 ECRv2/src/cudnn/time_fft_tiling/time_resnet.txt delete mode 100644 ECRv2/src/cudnn/time_fft_tiling/time_vgg.txt delete mode 100644 ECRv2/src/cudnn/time_gemm/time_resnet.txt delete mode 100644 ECRv2/src/cudnn/time_gemm/time_vgg.txt delete mode 100644 ECRv2/src/cudnn/time_im_gemm/time_resnet.txt delete mode 100644 ECRv2/src/cudnn/time_im_gemm/time_vgg.txt delete mode 100644 ECRv2/src/cudnn/time_resnet/batchsize32.txt delete mode 100644 ECRv2/src/cudnn/time_resnet/cudnn_half.cu delete mode 100644 ECRv2/src/cudnn/time_sparsity/batchsize32.txt delete mode 100644 ECRv2/src/cudnn/time_sparsity/cudnn_half.cu delete mode 100644 ECRv2/src/cudnn/time_stride/stride1_batchsize32.txt delete mode 100644 ECRv2/src/cudnn/time_stride/stride2_batchsize32.txt delete mode 100644 ECRv2/src/cudnn/time_stride/stride3_batchsize32.txt delete mode 100644 ECRv2/src/cudnn/time_vgg/batchsize32.txt delete mode 100644 ECRv2/src/cudnn/time_vgg/cudnn_half.cu create mode 100644 ECRv2/times_vgg/singleECR_times.txt create mode 100644 speedup/resnet/cuDNN_fast create mode 100644 speedup/resnet/cuDNN_fft create mode 100644 speedup/resnet/cuDNN_gemm.txt create mode 100644 speedup/resnet/cuDNN_imp_gemm.tx create mode 100644 speedup/resnet/singleECR.txt diff --git a/ECRv2/.vscode/c_cpp_properties.json b/ECRv2/.vscode/c_cpp_properties.json index b1d7633..f898253 100644 --- a/ECRv2/.vscode/c_cpp_properties.json +++ 
b/ECRv2/.vscode/c_cpp_properties.json
@@ -4,7 +4,9 @@
             "name": "Linux",
             "includePath": [
                 "${workspaceFolder}/**",
-                "/usr/local/cuda-12.1/targets/x86_64-linux/include"
+                "/usr/local/cuda-12.1/targets/x86_64-linux/include/**",
+                "/home/gokay/llvm-project/llvm/include/**",
+                "/usr/include/llvm-16"
             ],
             "defines": [],
             "compilerPath": "/home/gokay/llvm-project/build/bin/clang",
diff --git a/ECRv2/README.md b/ECRv2/README.md
index 911bf19..5285821 100644
--- a/ECRv2/README.md
+++ b/ECRv2/README.md
@@ -1,6 +1,8 @@
 To use the LLVM library together with CUDA, CMake can be used:
-## Requirements
+
+## Requirements
+
 - ninja
 - cmake
 - clang and lld
@@ -23,24 +25,27 @@
 cmake -G Ninja -S .. -B . \
 ninja
 ```
 
-# singleECR
+
+# singleECR
+
 The singleECR driver allows us to run a single convolution operation per feature and kernel. When the code is built, its binary is placed under the build directory. The time measurement is printed to the console.
 
 ## Usage
+
 Feature and kernel paths are provided via command-line arguments. Optionally, an output file can be provided for the resulting convolution.
 
 ```shell
 ./singleECR --kernel <kernel_path> --feature <feature_path>
 ```
+
 For example,
 
 ```shell
 ./singleECR --kernel ../../dataset/resnet/kernel/layer3.2.conv2.weight --feature ../../dataset/resnet/feature/feature38 --output singleECR_result.txt
 ```
+
 ## Testing
-(TODO_GOKAY A python script that takes in the kernel, feature and optionally the output of singlECR)
 
 ```shell
-python3 verify_convolution.py --kernel ../../dataset/resnet/kernel/layer3.2.conv2.weight --feature ../../dataset/resnet/feature/feature38 --test_output singleECR_result.txt
+python3 conv_test.py --kernel ../../dataset/resnet/kernel/layer3.2.conv2.weight --feature ../../dataset/resnet/feature/feature38 --test_output singleECR_result.txt
 ```
-
diff --git a/ECRv2/src/cudnn/Makefile b/ECRv2/src/cudnn/Makefile
deleted file mode 100644
index 4b30a18..0000000
--- a/ECRv2/src/cudnn/Makefile
+++ /dev/null
@@ -1,304 +0,0 @@
-################################################################################
-#
-# Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
-#
-# NOTICE TO USER:
-#
-# This source code is subject to NVIDIA ownership rights under U.S. and
-# international Copyright laws.
-#
-# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-# OR PERFORMANCE OF THIS SOURCE CODE.
-#
-# U.S. Government End Users. This source code is a "commercial item" as
-# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
-# "commercial computer software" and "commercial computer software
-# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-# and is provided to the U.S. Government only as a commercial end item.
-# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-# source code with only those rights set forth herein.
-# -################################################################################ -# -# Makefile project only supported on Mac OS X and Linux Platforms) -# -################################################################################ - -# Location of the CUDA Toolkit -CUDA_PATH ?= "/usr/local/cuda-12.1" - -############################## -# start deprecated interface # -############################## -ifeq ($(x86_64),1) - $(info WARNING - x86_64 variable has been deprecated) - $(info WARNING - please use TARGET_ARCH=x86_64 instead) - TARGET_ARCH ?= x86_64 -endif -ifeq ($(ARMv7),1) - $(info WARNING - ARMv7 variable has been deprecated) - $(info WARNING - please use TARGET_ARCH=armv7l instead) - TARGET_ARCH ?= armv7l -endif -ifeq ($(aarch64),1) - $(info WARNING - aarch64 variable has been deprecated) - $(info WARNING - please use TARGET_ARCH=aarch64 instead) - TARGET_ARCH ?= aarch64 -endif -ifeq ($(ppc64le),1) - $(info WARNING - ppc64le variable has been deprecated) - $(info WARNING - please use TARGET_ARCH=ppc64le instead) - TARGET_ARCH ?= ppc64le -endif -ifneq ($(GCC),) - $(info WARNING - GCC variable has been deprecated) - $(info WARNING - please use HOST_COMPILER=$(GCC) instead) - HOST_COMPILER ?= $(GCC) -endif -ifneq ($(abi),) - $(error ERROR - abi variable has been removed) -endif -############################ -# end deprecated interface # -############################ - -# architecture -HOST_ARCH := $(shell uname -m) -TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) - ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) - TARGET_SIZE := 64 - else ifneq (,$(filter $(TARGET_ARCH),armv7l)) - TARGET_SIZE := 32 - endif - else - TARGET_SIZE := $(shell getconf LONG_BIT) - endif -else - $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) -endif -ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) - $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) - endif -endif - -# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l -ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) - TARGET_ARCH = armv7l -endif - -# operating system -HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") -TARGET_OS ?= $(HOST_OS) -ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) - $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) -endif - -# host compiler -ifeq ($(TARGET_OS),darwin) - ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) - HOST_COMPILER ?= clang++ - endif -else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) - ifeq ($(TARGET_OS),linux) - HOST_COMPILER ?= arm-linux-gnueabihf-g++ - else ifeq ($(TARGET_OS),qnx) - ifeq ($(QNX_HOST),) - $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) - endif - ifeq ($(QNX_TARGET),) - $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) - endif - export QNX_HOST - export QNX_TARGET - HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ - else ifeq ($(TARGET_OS),android) - HOST_COMPILER ?= arm-linux-androideabi-g++ - endif - else ifeq ($(TARGET_ARCH),aarch64) - ifeq ($(TARGET_OS), linux) - HOST_COMPILER ?= aarch64-linux-gnu-g++ - else ifeq ($(TARGET_OS),qnx) - ifeq ($(QNX_HOST),) - $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) - endif - ifeq ($(QNX_TARGET),) - $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) - endif - export QNX_HOST - export QNX_TARGET - HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ - else ifeq ($(TARGET_OS), android) - HOST_COMPILER ?= aarch64-linux-android-clang++ - endif - else ifeq ($(TARGET_ARCH),ppc64le) - HOST_COMPILER ?= powerpc64le-linux-gnu-g++ - endif -endif -HOST_COMPILER ?= g++ -NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) - -# internal flags -NVCCFLAGS := -m${TARGET_SIZE} -CCFLAGS := -LDFLAGS := - -# build flags -ifeq ($(TARGET_OS),darwin) - LDFLAGS += -rpath $(CUDA_PATH)/lib - CCFLAGS += -arch $(HOST_ARCH) -else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) - LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 - CCFLAGS += -mfloat-abi=hard -else ifeq ($(TARGET_OS),android) - LDFLAGS += -pie - CCFLAGS += -fpie -fpic -fexceptions -endif - -ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) - ifneq ($(TARGET_FS),) - GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) - ifeq ($(GCCVERSIONLTEQ46),1) - CCFLAGS += --sysroot=$(TARGET_FS) - endif - LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf - endif - endif - ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) - ifneq ($(TARGET_FS),) - GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) - ifeq ($(GCCVERSIONLTEQ46),1) - CCFLAGS += --sysroot=$(TARGET_FS) - endif - LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu - LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu - endif - endif -endif - -ifeq ($(TARGET_OS),qnx) - CCFLAGS += -DWIN_INTERFACE_CUSTOM - LDFLAGS += -lsocket -endif - -# Install directory of different arch -CUDA_INSTALL_TARGET_DIR := -ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) - CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ -else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) - CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ -else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) - CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ -else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) - CUDA_INSTALL_TARGET_DIR = 
targets/aarch64-linux-androideabi/ -else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) - CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ -else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) - CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ -else ifeq ($(TARGET_ARCH),ppc64le) - CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ -endif - -# Debug build flags -ifeq ($(dbg),1) - NVCCFLAGS += -g -G - BUILD_TYPE := debug -else - BUILD_TYPE := release -endif - -ALL_CCFLAGS := -ALL_CCFLAGS += $(NVCCFLAGS) -ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) -ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) -ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) - -SAMPLE_ENABLED := 1 - -ALL_LDFLAGS := -ALL_LDFLAGS += $(ALL_CCFLAGS) -ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) -ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) - -# Common includes and paths for CUDA -INCLUDES := -I../../common/inc -LIBRARIES := - -################################################################################ - -# Gencode arguments -SMS ?= 75 - -ifeq ($(SMS),) -$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) -SAMPLE_ENABLED := 0 -endif - -ifeq ($(GENCODE_FLAGS),) -# Generate SASS code for each SM architecture listed in $(SMS) -$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) - -# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility -HIGHEST_SM := $(lastword $(sort $(SMS))) -ifneq ($(HIGHEST_SM),) -GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) -endif -endif - -LIBRARIES += -lcublas -lcudnn - -ifeq ($(SAMPLE_ENABLED),0) -EXEC ?= @echo "[@]" -endif - -################################################################################ - -# Target rules -all: build - -build: cudnn.cubin - -check.deps: -ifeq ($(SAMPLE_ENABLED),0) - @echo "Sample will be waived due to the above missing dependencies" -else - @echo "Sample is ready - all dependencies have been met" -endif - -cudnn.o:cudnn.cu - $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< - -cudnn.cubin: cudnn.o - $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) - $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) - $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) - -run: build - $(EXEC) ./cscc - -clean: - rm -f cscc cscc.o - rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/cscc - -clobber: clean diff --git a/ECRv2/src/cudnn/README.md b/ECRv2/src/cudnn/README.md deleted file mode 100644 index 95018f7..0000000 --- a/ECRv2/src/cudnn/README.md +++ /dev/null @@ -1,5 +0,0 @@ -Change CUDA path with `CUDA_PATH` variable if it's in a non-standard location. 
-
-```
-CUDA_PATH=/opt/cuda make
-```
diff --git a/ECRv2/src/cudnn/cudnn_half.cu b/ECRv2/src/cudnn/cudnn_half.cu
deleted file mode 100644
index 11957e3..0000000
--- a/ECRv2/src/cudnn/cudnn_half.cu
+++ /dev/null
@@ -1,426 +0,0 @@
-#include <iostream>
-#include <iomanip>
-#include <fstream>
-#include <string>
-#include <vector>
-#include <cstring>
-#include <cstdlib>
-#include <cuda_runtime.h>
-#include <cudnn.h>
-#include "cuda_fp16.h"
-
-using namespace std;
-
-#define CUDA_CALL(f) \
-    { \
-        cudaError_t err = (f); \
-        if (err != cudaSuccess) \
-        { \
-            std::cout \
-                << " Error occurred: " << err << std::endl; \
-            std::exit(1); \
-        } \
-    }
-
-#define CUDNN_CALL(f) \
-    { \
-        cudnnStatus_t err = (f); \
-        if (err != CUDNN_STATUS_SUCCESS) \
-        { \
-            std::cout \
-                << " Error occurred: " << err << std::endl; \
-            std::exit(1); \
-        } \
-    }
-
-void print(const float *data, int n, int c, int h, int w)
-{
-    std::vector<float> buffer(1 << 20);
-    CUDA_CALL(cudaMemcpy(
-        buffer.data(), data,
-        n * c * h * w * sizeof(float),
-        cudaMemcpyDeviceToHost));
-    int a = 0;
-    for (int i = 0; i < n; ++i)
-    {
-        for (int j = 0; j < c; ++j)
-        {
-            std::cout << "n=" << i << ", c=" << j << ":" << std::endl;
-            for (int k = 0; k < h; ++k)
-            {
-                for (int l = 0; l < w; ++l)
-                {
-                    std::cout << std::setw(4) << std::right << buffer[a];
-                    ++a;
-                }
-                std::cout << std::endl;
-            }
-        }
-    }
-    std::cout << std::endl;
-}
-
-float *LoadKernel(string name, int *&kernel_width, int *&kernel_height, int batch_size, int index)
-{
-    // ifstream kernel_shape("/home/syt/conv_pool/conv_pool/dataset/kernel/kernel_shape/" + name);
-    // for (int i = 0; i < 2; i++)
-    // {
-    //     kernel_shape >> *kernel_width >> *kernel_height;
-    // }
-    int shape[49] = {7,
-                     1, 3, 1, 1, 3, 1, 1, 3, 1,
-                     1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1,
-                     1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1,
-                     1, 3, 1, 1, 3, 1, 1, 3, 1};
-    *kernel_width = *kernel_height = shape[index];
-    float *kernel = new float[*kernel_width * *kernel_height * batch_size];
-    for (int i = 0; i < batch_size; i++)
-    {
-        ifstream kernel_data("dataset/resnetdataset_all/kernel/" + name);
-        for (int j = i * (*kernel_width * *kernel_height); j < (i + 1) * (*kernel_width * *kernel_height); j++)
-            kernel_data >> kernel[j];
-        kernel_data.close();
-    }
-
-    return kernel;
-}
-
-half *LoadvggKernel(string name, int *&kernel_width, int *&kernel_height, int batch_size)
-{
-    float temp;
-    *kernel_width = *kernel_height = 3;
-    half *kernel = new half[*kernel_width * *kernel_height * batch_size];
-    for (int i = 0; i < batch_size; i++)
-    {
-        ifstream kernel_data("/home/lfa/fsy/syt/conv_pool/dataset/vggdata/kernel/" + name + ".txt");
-        for (int j = i * (*kernel_width * *kernel_height); j < (i + 1) * (*kernel_width * *kernel_height); j++)
-        {
-            kernel_data >> temp;
-            kernel[j] = __float2half(temp);
-        }
-        kernel_data.close();
-    }
-
-    return kernel;
-}
-
-float *LoadConvWeight(int *&fea_width, int *&fea_height, int batch_size, int index)
-{
-    string *name = new string[49];
-
-    ifstream fea_name("dataset/resnetdataset_all/feature_name.txt");
-    for (int i = 0; i < 49; i++)
-    {
-        fea_name >> name[i];
-    }
-    fea_name.close();
-
-    // int shape[11] = {224, 112, 56, 56, 56, 28, 28, 28, 14, 14, 14};
-    // int shape = 28;
-    int shape[49] = {224,
-                     56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56,
-                     28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
-                     14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
-                     7, 7, 7, 7, 7, 7, 7};
-
-    *fea_width = *fea_height = shape[index];
-    // *fea_width = *fea_height = shape;
-
-    float *feature = new float[batch_size * *fea_width * *fea_height];
-
-    for (int i = 0; i < batch_size; i++)
-    {
-
ifstream fea_data("dataset/resnetdataset_all/feature/" + name[index]); - for (int j = i * (*fea_width * *fea_height); j < (i + 1) * (*fea_width * *fea_height); j++) - fea_data >> feature[j]; - fea_data.close(); - } - - return feature; -} - -half *LoadVggConvWeigth(int *&fea_width, int *&fea_height, int batch_size, int index) -{ - string *name = new string[16]; - - ifstream fea_name("/home/lfa/fsy/syt/conv_pool/dataset/vggdata/all_conv_name.txt"); - for (int i = 0; i < 16; i++) - { - fea_name >> name[i]; - } - fea_name.close(); - - // int shape[11] = {224, 112, 56, 56, 56, 28, 28, 28, 14, 14, 14}; - // int shape = 28; - int shape[16] = {224, 224, 112, 112, 56, 56, 56, 56, 28, 28, 28, 28, 14, 14, 14, 14}; - - *fea_width = *fea_height = shape[index]; - // *fea_width = *fea_height = shape; - float temp; - half *feature = new half[batch_size * *fea_width * *fea_height]; - - for (int i = 0; i < batch_size; i++) - { - ifstream fea_data("/home/lfa/fsy/syt/conv_pool/dataset/vggdata/feature/" + name[index]); - for (int j = i * (*fea_width * *fea_height); j < (i + 1) * (*fea_width * *fea_height); j++) - { - fea_data >> temp; - feature[j]=__float2half(temp); - } - fea_data.close(); - } - - return feature; -} - -float *LoadspConvWeigth(int *&fea_width, int *&fea_height, int batch_size, int index) -{ - string *name = new string[9]; - - ifstream fea_name("sparsity/dataset/sparsity_name.txt"); - for (int i = 0; i < 9; i++) - { - fea_name >> name[i]; - } - fea_name.close(); - - // int shape[11] = {224, 112, 56, 56, 56, 28, 28, 28, 14, 14, 14}; - int shape = 28; - - // *fea_width = *fea_height = shape[index]; - *fea_width = *fea_height = shape; - - float *feature = new float[batch_size * *fea_width * *fea_height]; - - for (int i = 0; i < batch_size; i++) - { - ifstream fea_data("sparsity/dataset/" + name[index]); - for (int j = i * (*fea_width * *fea_height); j < (i + 1) * (*fea_width * *fea_height); j++) - fea_data >> feature[j]; - fea_data.close(); - } - - return feature; -} - -int main(int argc, char *argv[]) -{ - int batch_size = atoi(argv[1]); - // load kernel file name - // string kername_name[49]; - // ifstream open_kernel("/home/syt/conv_pool/conv_pool/dataset/resnetdataset_all/kernel_name.txt"); - string kername_name[16]; - ifstream open_kernel("/home/lfa/fsy/syt/conv_pool/dataset/vggdata/kernel_name.txt"); - for (int i = 0; i < 16; i++) - { - open_kernel >> kername_name[i]; - } - open_kernel.close(); - - ofstream time_file(string("/home/lfa/fsy/syt/conv_pool/ECR/cudnn/time_2080/half_batchsize") + argv[1] + string(".txt")); - // ofstream time_file(string("/home/syt/conv_pool/conv_pool/ECR/cudnn/time_gemm/batchsize") + argv[1] + string(".txt")); - - for (int t = 0; t < 16; t++) - { - cudnnHandle_t cudnn; - CUDNN_CALL(cudnnCreate(&cudnn)); - - // input - // int batch_size = 2; - int *fea_width_ = new int; - int *fea_height_ = new int; - half *matrix; - // matrix = LoadConvWeight(fea_width_, fea_height_, batch_size, i); - matrix = LoadVggConvWeigth(fea_width_, fea_height_, batch_size, t); - // matrix = LoadspConvWeigth(fea_width_, fea_height_, batch_size, i); - - const int in_n = batch_size; - const int in_c = 1; - const int in_h = *fea_width_; - const int in_w = *fea_height_; - const int in_size = in_h * in_w * in_c * in_n; - - // cout << "in:" << in_n << " " << in_c << " " << in_h << " " << in_w << endl; - - // filter - // int stride = 1; - int *kernel_width_ = new int; - int *kernel_height_ = new int; - half *kernel; - - // kernel = LoadKernel(kername_name[i], kernel_width_, kernel_height_, 
batch_size, i);
-
-        kernel = LoadvggKernel(kername_name[t], kernel_width_, kernel_height_, batch_size);
-        const int filt_k = 1;
-        const int filt_c = 1;
-        const int filt_h = *kernel_width_;
-        const int filt_w = *kernel_height_;
-        const int file_size = filt_h * filt_w * filt_c * filt_k;
-
-        // cout << "ker:" << filt_k << " " << filt_c << " " << filt_h << " " << filt_w << endl;
-
-        // start timing
-        cudaEvent_t start, stop;
-        float elapsed_time = 0.0;
-        cudaEventCreate(&start);
-        cudaEventCreate(&stop);
-        cudaEventRecord(start, 0);
-
-        cudnnTensorDescriptor_t in_desc;
-        CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc));
-        CUDNN_CALL(cudnnSetTensor4dDescriptor(
-            in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF,
-            in_n, in_c, in_h, in_w));
-
-        half *in_data;
-        CUDA_CALL(cudaMalloc(
-            &in_data, in_n * in_c * in_h * in_w * sizeof(float) / 2));
-
-        cudnnFilterDescriptor_t filt_desc;
-        CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc));
-        CUDNN_CALL(cudnnSetFilter4dDescriptor(
-            filt_desc, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW,
-            filt_k, filt_c, filt_h, filt_w));
-
-        half *filt_data;
-        CUDA_CALL(cudaMalloc(
-            &filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float) / 2));
-
-        // convolution
-        const int pad_h = 0;
-        const int pad_w = 0;
-        const int str_h = 1;
-        const int str_w = 1;
-        const int dil_h = 1;
-        const int dil_w = 1;
-
-        cudnnConvolutionDescriptor_t conv_desc;
-        CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc));
-        CUDNN_CALL(cudnnSetConvolution2dDescriptor(
-            conv_desc,
-            pad_h, pad_w, str_h, str_w, dil_h, dil_w,
-            CUDNN_CONVOLUTION, CUDNN_DATA_HALF));
-        // cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH);
-        // output
-        int out_n;
-        int out_c;
-        int out_h;
-        int out_w;
-
-        CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim(
-            conv_desc, in_desc, filt_desc,
-            &out_n, &out_c, &out_h, &out_w));
-        // cout << "out:" << out_n << " " << out_c << " " << out_h << " " << out_w << endl;
-
-        cudnnTensorDescriptor_t out_desc;
-        CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc));
-        CUDNN_CALL(cudnnSetTensor4dDescriptor(
-            out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF,
-            out_n, out_c, out_h, out_w));
-
-        half *out_data;
-        CUDA_CALL(cudaMalloc(
-            &out_data, out_n * out_c * out_h * out_w * sizeof(float) / 2));
-
-        // algorithm
-        cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-
-        // = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
-        // = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
-        // = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-        // = CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING;
-        // = CUDNN_CONVOLUTION_FWD_ALGO_GEMM;
-
-        // CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(
-        //     cudnn,
-        //     in_desc, filt_desc, conv_desc, out_desc,
-        //     CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo));
-
-        // workspace
-        size_t ws_size;
-        CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(
-            cudnn, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size));
-
-        half *ws_data;
-        CUDA_CALL(cudaMalloc(&ws_data, ws_size));
-
-        // perform
-        float alpha = 1.0;
-        float beta = 0.0;
-
-        /*
-        float input_data[25 * 3] = {0, 0, 15, 8, 0, 22, 0, 0, 23, 0, 0, 8, 0, 0, 0, 0, 0, 19, 0, 0, 10, 4, 0, 0, 22,
-                                    0, 0, 15, 8, 0, 22, 0, 0, 23, 0, 0, 8, 0, 0, 0, 0, 0, 19, 0, 0, 10, 4, 0, 0, 22,
-                                    0, 0, 15, 8, 0, 22, 0, 0, 23, 0, 0, 8, 0, 0, 0, 0, 0, 19, 0, 0, 10, 4, 0, 0, 22};
-        float kernel_data[9 * 3] = {0, 1, 0, 1, 0, 1, 0, 1, 0,
-                                    0, 1, 0, 1, 0, 1, 0, 1, 0,
-                                    0, 1, 0, 1, 0, 1, 0, 1, 0};
-        */
-
-        cudaMemcpy(in_data, matrix, in_size * sizeof(float) / 2, cudaMemcpyHostToDevice);
-        cudaMemcpy(filt_data, kernel, file_size * sizeof(float) / 2, cudaMemcpyHostToDevice);
-
-        CUDNN_CALL(cudnnConvolutionForward(
-            cudnn,
-            &alpha, in_desc, in_data, filt_desc, filt_data,
-            conv_desc, algo, ws_data, ws_size,
-            &beta, out_desc, out_data));
-
-        // results
-        // std::cout << "in_data:" << std::endl;
-        // print(in_data, in_n, in_c, in_h, in_w);
-
-        // std::cout << "filt_data:" << std::endl;
-        // print(filt_data, filt_k, filt_c, filt_h, filt_w);
-
-        // std::cout << "out_data:" << std::endl;
-        // print(out_data, out_n, out_c, out_h, out_w);
-
-        int result_size = out_n * out_c * out_h * out_w;
-        half *result = new half[result_size];
-        cudaMemcpy(result, out_data, result_size * sizeof(float) / 2, cudaMemcpyDeviceToHost);
-
-        // finalizing
-        CUDA_CALL(cudaFree(ws_data));
-        CUDA_CALL(cudaFree(out_data));
-        CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc));
-        CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc));
-        CUDA_CALL(cudaFree(filt_data));
-        CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc));
-        CUDA_CALL(cudaFree(in_data));
-        CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc));
-        CUDNN_CALL(cudnnDestroy(cudnn));
-
-        // stop timing and compute the elapsed time
-        cudaEventRecord(stop, 0);
-        cudaEventSynchronize(stop);
-        cudaEventElapsedTime(&elapsed_time, start, stop);
-        // cout << elapsed_time << endl;
-        cudaEventDestroy(start);
-        cudaEventDestroy(stop);
-
-        // write the elapsed time to the time file
-        // ofstream time_file("time/time_batchsize1.txt", ios::app);
-        time_file << elapsed_time << endl;
-        // time_file.close();
-        // write the results to a file
-        // string file_name = "/home/lfa/fsy/syt/conv_pool/ECR/cudnn/output_vgg/output" + to_string(i);
-        // // string file_name = "/home/syt/conv_pool/conv_pool/ECR/cudnn/out_gemm/output" + to_string(i);
-
-        // ofstream output_file(file_name);
-        // for (int i = 0; i < result_size; i++)
-        // {
-        //     output_file << result[i] << " ";
-        //     if ((i + 1) % out_w == 0)
-        //         output_file << "\n";
-        // }
-        // output_file.close();
-        // cout << "res:" << result[1] << endl;
-        delete[] result;
-    }
-
-    time_file.close();
-
-    return 0;
-}
\ No newline at end of file
diff --git a/ECRv2/src/cudnn/samples_common.mk b/ECRv2/src/cudnn/samples_common.mk
deleted file mode 100644
index 1e15155..0000000
--- a/ECRv2/src/cudnn/samples_common.mk
+++ /dev/null
@@ -1,34 +0,0 @@
-# Setting SMS for all samples
-# architecture
-
-ifneq ($(TARGET_ARCH), ppc64le)
-CUDA_VERSION := $(shell cat $(CUDA_PATH)/include/cuda.h |grep "define CUDA_VERSION" |awk '{print $$3}')
-else
-CUDA_VERSION := $(shell cat $(CUDA_PATH)/targets/ppc64le-linux/include/cuda.h |grep "define CUDA_VERSION" |awk '{print $$3}')
-endif
-
-#Link against cublasLt for CUDA 10.1 and up.
-CUBLASLT:=false -ifeq ($(shell test $(CUDA_VERSION) -ge 10010; echo $$?),0) -CUBLASLT:=true -endif -$(info Linking agains cublasLt = $(CUBLASLT)) - -ifeq ($(CUDA_VERSION),8000 ) -SMS_VOLTA = -else -ifneq ($(TARGET_ARCH), ppc64le) -ifeq ($(CUDA_VERSION), $(filter $(CUDA_VERSION), 9000 9010 9020)) -SMS_VOLTA ?= 70 -else -ifeq ($(TARGET_OS), darwin) -SMS_VOLTA ?= 70 -else -SMS_VOLTA ?= 70 72 75 -endif #ifneq ($(TARGET_OS), darwin) -endif #ifeq ($(CUDA_VERSION), $(filter $(CUDA_VERSION), 9000 9010 9020)) -else -SMS_VOLTA ?= 70 -endif #ifneq ($(TARGET_ARCH), ppc64le) -endif #ifeq ($(CUDA_VERSION),8000 ) -SMS ?= 30 35 50 53 60 61 62 $(SMS_VOLTA) diff --git a/ECRv2/src/cudnn/time.txt b/ECRv2/src/cudnn/time.txt deleted file mode 100644 index 1a3fe39..0000000 --- a/ECRv2/src/cudnn/time.txt +++ /dev/null @@ -1,32 +0,0 @@ -1.36141 -1.18371 -1.18614 -1.15264 -1.28784 -1.22947 -1.28982 -1.12986 -1.15386 -1.09363 -1.12003 -1.5615 -1.20592 -1.08922 -1.12413 -1.12688 -2.23859 -1.07363 -1.10995 -1.0784 -1.09808 -1.0943 -1.1201 -1.07677 -1.10061 -1.09296 -1.09658 -1.08029 -1.10288 -1.06099 -1.07981 -1.10906 diff --git a/ECRv2/src/cudnn/time_fast/time_resnet.txt b/ECRv2/src/cudnn/time_fast/time_resnet.txt deleted file mode 100644 index 6c58ca9..0000000 --- a/ECRv2/src/cudnn/time_fast/time_resnet.txt +++ /dev/null @@ -1,47 +0,0 @@ -1.498842 -1.52997 -1.352204 -1.34096 -1.338912 -1.339878 -1.342982 -1.341786 -1.362706 -1.394674 -1.366912 -1.103188 -1.118106 -1.106032 -1.119622 -1.094246 -1.126834 -1.127086 -1.133862 -1.128378 -1.155532 -1.131976 -1.127642 -1.069882 -1.04939 -1.106502 -1.051886 -1.042112 -1.051886 -1.054822 -1.051218 -1.075576 -1.050414 -1.054728 -1.05929 -1.065522 -1.048626 -1.044998 -1.044262 -1.05353 -1.067334 -1.061644 -1.026886 -1.071644 -1.029904 -1.02352 -1.058042 diff --git a/ECRv2/src/cudnn/time_fast/time_vgg.txt b/ECRv2/src/cudnn/time_fast/time_vgg.txt deleted file mode 100644 index 770d087..0000000 --- a/ECRv2/src/cudnn/time_fast/time_vgg.txt +++ /dev/null @@ -1,16 +0,0 @@ -8.244428 -10.53496 -2.811636 -4.446938 -1.50843 -1.434862 -1.385626 -3.294054 -1.21101 -1.133334 -1.118854 -2.962208 -1.162046 -1.0798 -1.053594 -2.959078 diff --git a/ECRv2/src/cudnn/time_fft_tiling/time_resnet.txt b/ECRv2/src/cudnn/time_fft_tiling/time_resnet.txt deleted file mode 100644 index 384949b..0000000 --- a/ECRv2/src/cudnn/time_fft_tiling/time_resnet.txt +++ /dev/null @@ -1,47 +0,0 @@ -1.1328 -1.34643 -1.06483 -1.02045 -1.07802 -1.00634 -1.00922 -1.05632 -1.06554 -1.04966 -1.1297 -0.683584 -0.679712 -0.707904 -0.736672 -0.725984 -0.733312 -0.732096 -0.731776 -0.745856 -0.750208 -0.734208 -0.771488 -0.663328 -0.671968 -0.679232 -0.6808 -0.658656 -0.674592 -0.657152 -0.658112 -0.6824 -0.659936 -0.65728 -0.671712 -0.712576 -0.659392 -0.702144 -0.678176 -0.663328 -0.700128 -0.649696 -0.674624 -0.663712 -0.651584 -0.648288 -0.70624 diff --git a/ECRv2/src/cudnn/time_fft_tiling/time_vgg.txt b/ECRv2/src/cudnn/time_fft_tiling/time_vgg.txt deleted file mode 100644 index af81e1a..0000000 --- a/ECRv2/src/cudnn/time_fft_tiling/time_vgg.txt +++ /dev/null @@ -1,16 +0,0 @@ -8.785606 -11.63882 -2.561178 -4.449632 -1.238618 -1.17013 -1.143816 -3.094618 -0.9470848 -0.9143168 -0.9148032 -2.78231 -0.8726848 -0.8750272 -0.8773312 -2.698176 diff --git a/ECRv2/src/cudnn/time_gemm/time_resnet.txt b/ECRv2/src/cudnn/time_gemm/time_resnet.txt deleted file mode 100644 index 3574be5..0000000 --- a/ECRv2/src/cudnn/time_gemm/time_resnet.txt +++ /dev/null @@ -1,47 +0,0 @@ -1.457616 -1.546673 -1.425665 -1.416509 -1.443509 -1.459167 
-1.49969 -1.484921 -1.511067 -1.505296 -1.475314 -1.131281 -1.514733 -1.108248 -1.143115 -1.142843 -1.129089 -1.137077 -1.134342 -1.14011 -1.13191 -1.144483 -1.136984 -1.076832 -1.050228 -1.042749 -1.059286 -1.05894 -1.05298 -1.069438 -1.05886 -1.050689 -1.055591 -1.066893 -1.266105 -1.101788 -1.059091 -1.062666 -1.06943 -1.063473 -1.063801 -1.037766 -1.033702 -1.078181 -1.05837 -1.037567 -1.041084 diff --git a/ECRv2/src/cudnn/time_gemm/time_vgg.txt b/ECRv2/src/cudnn/time_gemm/time_vgg.txt deleted file mode 100644 index 4af5b24..0000000 --- a/ECRv2/src/cudnn/time_gemm/time_vgg.txt +++ /dev/null @@ -1,16 +0,0 @@ -8.098009 -8.85654 -2.712784 -2.490717 -1.494214 -1.398752 -1.367342 -1.387395 -1.129809 -1.099786 -1.102848 -1.123874 -1.066295 -1.040196 -1.048977 -1.042723 diff --git a/ECRv2/src/cudnn/time_im_gemm/time_resnet.txt b/ECRv2/src/cudnn/time_im_gemm/time_resnet.txt deleted file mode 100644 index 428065b..0000000 --- a/ECRv2/src/cudnn/time_im_gemm/time_resnet.txt +++ /dev/null @@ -1,47 +0,0 @@ -1.259628 -1.358336 -1.16967 -1.1581 -1.225676 -1.16604 -1.176308 -1.23748 -1.229562 -1.245146 -1.305364 -0.9357312 -0.9341312 -0.9417088 -0.994272 -0.9396344 -0.9637892 -0.9418176 -0.942624 -0.9506492 -0.9449728 -0.9389632 -0.9567104 -0.85792 -0.8622912 -0.8721152 -0.870976 -0.860832 -0.8793216 -0.865536 -0.8981576 -0.8808896 -0.852704 -0.8786752 -0.8802112 -0.8541632 -0.8895552 -0.8734144 -0.8550912 -1.0387704 -0.873888 -0.8481536 -0.8515328 -0.8708224 -0.8738048 -0.8366592 -0.8618752 diff --git a/ECRv2/src/cudnn/time_im_gemm/time_vgg.txt b/ECRv2/src/cudnn/time_im_gemm/time_vgg.txt deleted file mode 100644 index 61744f7..0000000 --- a/ECRv2/src/cudnn/time_im_gemm/time_vgg.txt +++ /dev/null @@ -1,16 +0,0 @@ -8.115706 -10.66202 -2.246182 -4.257288 -1.24889 -1.17317 -1.093254 -3.125716 -0.91088 -0.9024384 -0.8913536 -2.780704 -0.8514624 -0.8350848 -0.8339392 -2.669132 diff --git a/ECRv2/src/cudnn/time_resnet/batchsize32.txt b/ECRv2/src/cudnn/time_resnet/batchsize32.txt deleted file mode 100644 index dde5c53..0000000 --- a/ECRv2/src/cudnn/time_resnet/batchsize32.txt +++ /dev/null @@ -1,47 +0,0 @@ -1.457233 -1.439345 -1.422449 -1.390892 -1.404805 -1.399081 -1.437757 -1.429952 -1.445194 -1.42816 -1.470265 -1.127645 -1.092218 -1.13515 -1.143337 -1.136884 -1.123715 -1.121395 -1.1275 -1.120337 -1.129813 -1.155315 -1.152745 -1.086163 -1.053969 -1.079892 -1.073841 -1.070092 -1.067156 -1.060364 -1.069228 -1.062227 -1.056918 -1.061615 -1.054527 -1.175946 -1.133718 -1.080154 -1.075609 -1.050516 -1.053262 -1.06012 -1.035418 -1.063372 -1.029721 -1.034117 -1.03581 diff --git a/ECRv2/src/cudnn/time_resnet/cudnn_half.cu b/ECRv2/src/cudnn/time_resnet/cudnn_half.cu deleted file mode 100644 index b4953e9..0000000 --- a/ECRv2/src/cudnn/time_resnet/cudnn_half.cu +++ /dev/null @@ -1,427 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "cuda_fp16.h" - -using namespace std; - -#define CUDA_CALL(f) \ - { \ - cudaError_t err = (f); \ - if (err != cudaSuccess) \ - { \ - std::cout \ - << " Error occurred: " << err << std::endl; \ - std::exit(1); \ - } \ - } - -#define CUDNN_CALL(f) \ - { \ - cudnnStatus_t err = (f); \ - if (err != CUDNN_STATUS_SUCCESS) \ - { \ - std::cout \ - << " Error occurred: " << err << std::endl; \ - std::exit(1); \ - } \ - } - -void print(const float *data, int n, int c, int h, int w) -{ - std::vector buffer(1 << 20); - CUDA_CALL(cudaMemcpy( - buffer.data(), data, - n * c * h * w * sizeof(float), - cudaMemcpyDeviceToHost)); - int a = 0; - for 
(int i = 0; i < n; ++i) - { - for (int j = 0; j < c; ++j) - { - std::cout << "n=" << i << ", c=" << j << ":" << std::endl; - for (int k = 0; k < h; ++k) - { - for (int l = 0; l < w; ++l) - { - std::cout << std::setw(4) << std::right << buffer[a]; - ++a; - } - std::cout << std::endl; - } - } - } - std::cout << std::endl; -} - -float *LoadKernel(string name, int *&kernel_width, int *&kernel_height, int batch_size, int index) -{ - // ifstream kernel_shape("/home/syt/conv_pool/conv_pool/dataset/kernel/kernel_shape/" + name); - // for (int i = 0; i < 2; i++) - // { - // kernel_shape >> *kernel_width >> *kernel_height; - // } - int shape[49] = {7, - 1, 3, 1, 1, 3, 1, 1, 3, 1, - 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, - 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, - 1, 3, 1, 1, 3, 1, 1, 3, 1}; - *kernel_width = *kernel_height = shape[index]; - float *kernel = new float[*kernel_width * *kernel_height * batch_size]; - for (int i = 0; i < batch_size; i++) - { - ifstream kernel_data("dataset/resnet/kernel/" + name); - for (int j = i * (*kernel_width * *kernel_height); j < (i + 1) * (*kernel_width * *kernel_height); j++) - kernel_data >> kernel[j]; - kernel_data.close(); - } - - return kernel; -} - -half *LoadvggKernel(string name, int *&kernel_width, int *&kernel_height, int batch_size) -{ - float temp; - *kernel_width = *kernel_height = 3; - half *kernel = new half[*kernel_width * *kernel_height * batch_size]; - for (int i = 0; i < batch_size; i++) - { - ifstream kernel_data("dataset/vggdata/kernel/" + name +".txt"); - for (int j = i * (*kernel_width * *kernel_height); j < (i + 1) * (*kernel_width * *kernel_height); j++) - { - kernel_data >> temp; - kernel[j]=__float2half(temp); - } - kernel_data.close(); - } - - return kernel; -} - -half *LoadConvWeight(int *&fea_width, int *&fea_height, int batch_size, int index) -{ - string *name = new string[49]; - - ifstream fea_name("dataset/resnet/feature_name.txt"); - for (int i = 0; i < 49; i++) - { - fea_name >> name[i]; - } - fea_name.close(); - - // int shape[11] = {224, 112, 56, 56, 56, 28, 28, 28, 14, 14, 14}; - // int shape = 28; - int shape[49] = {224, - 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 7, 7, 7, 7, 7, 7, 7}; - - *fea_width = *fea_height = shape[index]; - // *fea_width = *fea_height = shape; - float temp; - float *feature = new float[batch_size * *fea_width * *fea_height]; - - for (int i = 0; i < batch_size; i++) - { - ifstream fea_data("dataset/resnet/kernel/" + name[index]); - for (int j = i * (*fea_width * *fea_height); j < (i + 1) * (*fea_width * *fea_height); j++) - fea_data >> temp; - feature[j] = __float2half(temp); - fea_data.close(); - } - - return feature; -} - -half *LoadVggConvWeigth(int *&fea_width, int *&fea_height, int batch_size, int index) -{ - string *name = new string[16]; - - ifstream fea_name("/home/lfa/fsy/syt/conv_pool/dataset/vggdata/all_conv_name.txt"); - for (int i = 0; i < 16; i++) - { - fea_name >> name[i]; - } - fea_name.close(); - - // int shape[11] = {224, 112, 56, 56, 56, 28, 28, 28, 14, 14, 14}; - // int shape = 28; - int shape[16] = {224, 224, 112, 112, 56, 56, 56, 56, 28, 28, 28, 28, 14, 14, 14, 14}; - - *fea_width = *fea_height = shape[index]; - // *fea_width = *fea_height = shape; - float temp; - half *feature = new half[batch_size * *fea_width * *fea_height]; - - for (int i = 0; i < batch_size; i++) - { - ifstream 
fea_data("/home/lfa/fsy/syt/conv_pool/dataset/vggdata/feature/" + name[index]); - for (int j = i * (*fea_width * *fea_height); j < (i + 1) * (*fea_width * *fea_height); j++) - { - fea_data >> temp; - feature[j]=__float2half(temp); - } - fea_data.close(); - } - - return feature; -} - -float *LoadspConvWeigth(int *&fea_width, int *&fea_height, int batch_size, int index) -{ - string *name = new string[9]; - - ifstream fea_name("sparsity/dataset/sparsity_name.txt"); - for (int i = 0; i < 9; i++) - { - fea_name >> name[i]; - } - fea_name.close(); - - // int shape[11] = {224, 112, 56, 56, 56, 28, 28, 28, 14, 14, 14}; - int shape = 28; - - // *fea_width = *fea_height = shape[index]; - *fea_width = *fea_height = shape; - - float *feature = new float[batch_size * *fea_width * *fea_height]; - - for (int i = 0; i < batch_size; i++) - { - ifstream fea_data("sparsity/dataset/" + name[index]); - for (int j = i * (*fea_width * *fea_height); j < (i + 1) * (*fea_width * *fea_height); j++) - fea_data >> feature[j]; - fea_data.close(); - } - - return feature; -} - -int main(int argc, char *argv[]) -{ - int batch_size = atoi(argv[1]); - // load kernel file name - string kername_name[49]; - // ifstream open_kernel("/home/syt/conv_pool/conv_pool/dataset/resnetdataset_all/kernel_name.txt"); - // string kername_name[16]; - ifstream open_kernel("dataset/resnet/kernel_name.txt"); - for (int i = 1; i < 49; i++) - { - open_kernel >> kername_name[i]; - } - open_kernel.close(); - - ofstream time_file(string("ECR/ECR/time_resnet/batchsize") + argv[1] + string(".txt")); - // ofstream time_file(string("/home/syt/conv_pool/conv_pool/ECR/cudnn/time_gemm/batchsize") + argv[1] + string(".txt")); - // 1-47 - for (int t = 1; t < 48; t++) - { - cudnnHandle_t cudnn; - CUDNN_CALL(cudnnCreate(&cudnn)); - - // input - // int batch_size = 2; - int *fea_width_ = new int; - int *fea_height_ = new int; - half *matrix; - matrix = LoadConvWeight(fea_width_, fea_height_, batch_size, i); - // matrix = LoadVggConvWeigth(fea_width_, fea_height_, batch_size, t); - // matrix = LoadspConvWeigth(fea_width_, fea_height_, batch_size, i); - - const int in_n = batch_size; - const int in_c = 1; - const int in_h = *fea_width_; - const int in_w = *fea_height_; - const int in_size = in_h * in_w * in_c * in_n; - - // cout << "in:" << in_n << " " << in_c << " " << in_h << " " << in_w << endl; - - // filter - // int stride = 1; - int *kernel_width_ = new int; - int *kernel_height_ = new int; - half *kernel; - - kernel = LoadKernel(kername_name[i], kernel_width_, kernel_height_, batch_size, i); - - // kernel = LoadvggKernel(kername_name[t], kernel_width_, kernel_height_, batch_size); - const int filt_k = 1; - const int filt_c = 1; - const int filt_h = *kernel_width_; - const int filt_w = *kernel_height_; - const int file_size = filt_h * filt_w * filt_c * filt_k; - - // cout << "ker:" << filt_k << " " << filt_c << " " << filt_h << " " << filt_w << endl; - - // 记录时间 - cudaEvent_t start, stop; - float elapsed_time = 0.0; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start, 0); - - cudnnTensorDescriptor_t in_desc; - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc)); - CUDNN_CALL(cudnnSetTensor4dDescriptor( - in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, - in_n, in_c, in_h, in_w)); - - half *in_data; - CUDA_CALL(cudaMalloc( - &in_data, in_n * in_c * in_h * in_w * sizeof(float)/2)); - - cudnnFilterDescriptor_t filt_desc; - CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc)); - CUDNN_CALL(cudnnSetFilter4dDescriptor( - filt_desc, 
CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, - filt_k, filt_c, filt_h, filt_w)); - - half *filt_data; - CUDA_CALL(cudaMalloc( - &filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float)/2)); - - // convolution - const int pad_h = 0; - const int pad_w = 0; - const int str_h = 1; - const int str_w = 1; - const int dil_h = 1; - const int dil_w = 1; - - cudnnConvolutionDescriptor_t conv_desc; - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc)); - CUDNN_CALL(cudnnSetConvolution2dDescriptor( - conv_desc, - pad_h, pad_w, str_h, str_w, dil_h, dil_w, - CUDNN_CONVOLUTION, CUDNN_DATA_HALF)); - // cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH); - // output - int out_n; - int out_c; - int out_h; - int out_w; - - CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim( - conv_desc, in_desc, filt_desc, - &out_n, &out_c, &out_h, &out_w)); - // cout << "out:" << out_n << " " << out_c << " " << out_h << " " << out_w << endl; - - cudnnTensorDescriptor_t out_desc; - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc)); - CUDNN_CALL(cudnnSetTensor4dDescriptor( - out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, - out_n, out_c, out_h, out_w)); - - half *out_data; - CUDA_CALL(cudaMalloc( - &out_data, out_n * out_c * out_h * out_w * sizeof(float)/2)); - - // algorithm - cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - - // = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; - // = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD; - // = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - // = CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING; - // = CUDNN_CONVOLUTION_FWD_ALGO_GEMM; - - // CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm( - // cudnn, - // in_desc, filt_desc, conv_desc, out_desc, - // CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo)); - - // workspace - size_t ws_size; - CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize( - cudnn, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size)); - - half *ws_data; - CUDA_CALL(cudaMalloc(&ws_data, ws_size)); - - // perform - float alpha = 1.0; - float beta = 0.0; - - /* - float input_data[25 * 3] = {0, 0, 15, 8, 0, 22, 0, 0, 23, 0, 0, 8, 0, 0, 0, 0, 0, 19, 0, 0, 10, 4, 0, 0, 22, - 0, 0, 15, 8, 0, 22, 0, 0, 23, 0, 0, 8, 0, 0, 0, 0, 0, 19, 0, 0, 10, 4, 0, 0, 22, - 0, 0, 15, 8, 0, 22, 0, 0, 23, 0, 0, 8, 0, 0, 0, 0, 0, 19, 0, 0, 10, 4, 0, 0, 22}; - float kernel_data[9 * 3] = {0, 1, 0, 1, 0, 1, 0, 1, 0, - 0, 1, 0, 1, 0, 1, 0, 1, 0, - 0, 1, 0, 1, 0, 1, 0, 1, 0}; - */ - - cudaMemcpy(in_data, matrix, in_size * sizeof(float)/2, cudaMemcpyHostToDevice); - cudaMemcpy(filt_data, kernel, file_size * sizeof(float)/2, cudaMemcpyHostToDevice); - - CUDNN_CALL(cudnnConvolutionForward( - cudnn, - &alpha, in_desc, in_data, filt_desc, filt_data, - conv_desc, algo, ws_data, ws_size, - &beta, out_desc, out_data)); - - // results - // std::cout << "in_data:" << std::endl; - // print(in_data, in_n, in_c, in_h, in_w); - - // std::cout << "filt_data:" << std::endl; - // print(filt_data, filt_k, filt_c, filt_h, filt_w); - - // std::cout << "out_data:" << std::endl; - // print(out_data, out_n, out_c, out_h, out_w); - - int result_size = out_n * out_c * out_h * out_w; - half *result = new half[result_size]; - cudaMemcpy(result, out_data, result_size * sizeof(float)/2, cudaMemcpyDeviceToHost); - - // finalizing - CUDA_CALL(cudaFree(ws_data)); - CUDA_CALL(cudaFree(out_data)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc)); - CUDA_CALL(cudaFree(filt_data)); - CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc)); - 
CUDA_CALL(cudaFree(in_data)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc)); - CUDNN_CALL(cudnnDestroy(cudnn)); - - // 计算时间 - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&elapsed_time, start, stop); - // cout << elapsed_time << endl; - cudaEventDestroy(start); - cudaEventDestroy(stop); - - // 写入时间 - // ofstream time_file("time/time_batchsize1.txt", ios::app); - time_file << elapsed_time << endl; - // time_file.close(); - // 结果写入文件 - // string file_name = "/home/lfa/fsy/syt/conv_pool/ECR/cudnn/output_vgg/output" + to_string(i); - // // string file_name = "/home/syt/conv_pool/conv_pool/ECR/cudnn/out_gemm/output" + to_string(i); - - // ofstream output_file(file_name); - // for (int i = 0; i < result_size; i++) - // { - // output_file << result[i] << " "; - // if ((i + 1) % out_w == 0) - // output_file << "\n"; - // } - // output_file.close(); - // cout << "res:" << result[1] << endl; - memset(result,0,sizeof(result)); - free(result); - } - - time_file.close(); - - return 0; -} \ No newline at end of file diff --git a/ECRv2/src/cudnn/time_sparsity/batchsize32.txt b/ECRv2/src/cudnn/time_sparsity/batchsize32.txt deleted file mode 100644 index 27af9f6..0000000 --- a/ECRv2/src/cudnn/time_sparsity/batchsize32.txt +++ /dev/null @@ -1,9 +0,0 @@ -1.13853 -0.752192 -0.740608 -0.760096 -0.781472 -0.746464 -0.805792 -0.7592 -0.760256 diff --git a/ECRv2/src/cudnn/time_sparsity/cudnn_half.cu b/ECRv2/src/cudnn/time_sparsity/cudnn_half.cu deleted file mode 100644 index 510279f..0000000 --- a/ECRv2/src/cudnn/time_sparsity/cudnn_half.cu +++ /dev/null @@ -1,425 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "cuda_fp16.h" - -using namespace std; - -#define CUDA_CALL(f) \ - { \ - cudaError_t err = (f); \ - if (err != cudaSuccess) \ - { \ - std::cout \ - << " Error occurred: " << err << std::endl; \ - std::exit(1); \ - } \ - } - -#define CUDNN_CALL(f) \ - { \ - cudnnStatus_t err = (f); \ - if (err != CUDNN_STATUS_SUCCESS) \ - { \ - std::cout \ - << " Error occurred: " << err << std::endl; \ - std::exit(1); \ - } \ - } - -void print(const float *data, int n, int c, int h, int w) -{ - std::vector buffer(1 << 20); - CUDA_CALL(cudaMemcpy( - buffer.data(), data, - n * c * h * w * sizeof(float), - cudaMemcpyDeviceToHost)); - int a = 0; - for (int i = 0; i < n; ++i) - { - for (int j = 0; j < c; ++j) - { - std::cout << "n=" << i << ", c=" << j << ":" << std::endl; - for (int k = 0; k < h; ++k) - { - for (int l = 0; l < w; ++l) - { - std::cout << std::setw(4) << std::right << buffer[a]; - ++a; - } - std::cout << std::endl; - } - } - } - std::cout << std::endl; -} - -float *LoadKernel(string name, int *&kernel_width, int *&kernel_height, int batch_size, int index) -{ - // ifstream kernel_shape("/home/syt/conv_pool/conv_pool/dataset/kernel/kernel_shape/" + name); - // for (int i = 0; i < 2; i++) - // { - // kernel_shape >> *kernel_width >> *kernel_height; - // } - int shape[49] = {7, - 1, 3, 1, 1, 3, 1, 1, 3, 1, - 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, - 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, - 1, 3, 1, 1, 3, 1, 1, 3, 1}; - *kernel_width = *kernel_height = shape[index]; - float *kernel = new float[*kernel_width * *kernel_height * batch_size]; - for (int i = 0; i < batch_size; i++) - { - ifstream kernel_data("dataset/resnetdataset_all/kernel/" + name); - for (int j = i * (*kernel_width * *kernel_height); j < (i + 1) * (*kernel_width * *kernel_height); j++) - kernel_data >> kernel[j]; - kernel_data.close(); - } - - 
return kernel; -} - -half *LoadvggKernel(string name, int *&kernel_width, int *&kernel_height, int batch_size) -{ - float temp; - *kernel_width = *kernel_height = 3; - half *kernel = new half[*kernel_width * *kernel_height * batch_size]; - for (int i = 0; i < batch_size; i++) - { - ifstream kernel_data("/home/lfa/fsy/syt/conv_pool/dataset/vggdata/kernel/" + name +".txt"); - for (int j = i * (*kernel_width * *kernel_height); j < (i + 1) * (*kernel_width * *kernel_height); j++) - { - kernel_data >> temp; - kernel[j]=__float2half(temp); - } - kernel_data.close(); - } - - return kernel; -} - -float *LoadConvWeight(int *&fea_width, int *&fea_height, int batch_size, int index) -{ - string *name = new string[49]; - - ifstream fea_name("dataset/resnetdataset_all/feature_name.txt"); - for (int i = 0; i < 49; i++) - { - fea_name >> name[i]; - } - fea_name.close(); - - // int shape[11] = {224, 112, 56, 56, 56, 28, 28, 28, 14, 14, 14}; - // int shape = 28; - int shape[49] = {224, - 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 7, 7, 7, 7, 7, 7, 7}; - - *fea_width = *fea_height = shape[index]; - // *fea_width = *fea_height = shape; - - float *feature = new float[batch_size * *fea_width * *fea_height]; - - for (int i = 0; i < batch_size; i++) - { - ifstream fea_data("dataset/resnetdataset_all/feature/" + name[index]); - for (int j = i * (*fea_width * *fea_height); j < (i + 1) * (*fea_width * *fea_height); j++) - fea_data >> feature[j]; - fea_data.close(); - } - - return feature; -} - -half *LoadVggConvWeigth(int *&fea_width, int *&fea_height, int batch_size, int index) -{ - string *name = new string[16]; - - ifstream fea_name("/home/lfa/fsy/syt/conv_pool/dataset/vggdata/all_conv_name.txt"); - for (int i = 0; i < 16; i++) - { - fea_name >> name[i]; - } - fea_name.close(); - - // int shape[11] = {224, 112, 56, 56, 56, 28, 28, 28, 14, 14, 14}; - // int shape = 28; - int shape[16] = {224, 224, 112, 112, 56, 56, 56, 56, 28, 28, 28, 28, 14, 14, 14, 14}; - - *fea_width = *fea_height = shape[index]; - // *fea_width = *fea_height = shape; - float temp; - half *feature = new half[batch_size * *fea_width * *fea_height]; - - for (int i = 0; i < batch_size; i++) - { - ifstream fea_data("/home/lfa/fsy/syt/conv_pool/dataset/vggdata/feature/" + name[index]); - for (int j = i * (*fea_width * *fea_height); j < (i + 1) * (*fea_width * *fea_height); j++) - { - fea_data >> temp; - feature[j]=__float2half(temp); - } - fea_data.close(); - } - - return feature; -} - -half *LoadspConvWeigth(int *&fea_width, int *&fea_height, int batch_size, int index) -{ - string *name = new string[9]; - - ifstream fea_name("sparsity/dataset/sparsity_name.txt"); - for (int i = 0; i < 9; i++) - { - fea_name >> name[i]; - } - fea_name.close(); - - // int shape[11] = {224, 112, 56, 56, 56, 28, 28, 28, 14, 14, 14}; - int shape = 28; - - // *fea_width = *fea_height = shape[index]; - *fea_width = *fea_height = shape; - - float *feature = new float[batch_size * *fea_width * *fea_height]; - float temp; - for (int i = 0; i < batch_size; i++) - { - ifstream fea_data("dataset/sparsity/dataset/" + name[index]); - for (int j = i * (*fea_width * *fea_height); j < (i + 1) * (*fea_width * *fea_height); j++) - fea_data >> temp; - feature[j] = __float2half(temp); - fea_data.close(); - } - - return feature; -} - -int main(int argc, char *argv[]) -{ - int batch_size = atoi(argv[1]); - // load kernel file name - // string kername_name[49]; - 
// ifstream open_kernel("/home/syt/conv_pool/conv_pool/dataset/resnetdataset_all/kernel_name.txt"); - string kername_name[9]; - ifstream open_kernel("dataset/sparsity/dataset/sparsity_name.txt"); - for (int i = 0; i < 9; i++) - { - open_kernel >> kername_name[i]; - } - open_kernel.close(); - - ofstream time_file("ECR/ECR/time_sparsity/batchsize" + argv[1] + string(".txt")); - // ofstream time_file(string("/home/syt/conv_pool/conv_pool/ECR/cudnn/time_gemm/batchsize") + argv[1] + string(".txt")); - - for (int t = 0; t < 9; t++) - { - cudnnHandle_t cudnn; - CUDNN_CALL(cudnnCreate(&cudnn)); - - // input - // int batch_size = 2; - int *fea_width_ = new int; - int *fea_height_ = new int; - half *matrix; - // matrix = LoadConvWeight(fea_width_, fea_height_, batch_size, i); - matrix = LoadVggConvWeigth(fea_width_, fea_height_, batch_size, t); - // matrix = LoadspConvWeigth(fea_width_, fea_height_, batch_size, i); - - const int in_n = batch_size; - const int in_c = 1; - const int in_h = *fea_width_; - const int in_w = *fea_height_; - const int in_size = in_h * in_w * in_c * in_n; - - // cout << "in:" << in_n << " " << in_c << " " << in_h << " " << in_w << endl; - - // filter - // int stride = 1; - int *kernel_width_ = new int; - int *kernel_height_ = new int; - half *kernel - - kernel = LoadspConvWeigth(kername_name[t], kernel_width_, kernel_height_, batch_size); - const int filt_k = 1; - const int filt_c = 1; - const int filt_h = *kernel_width_; - const int filt_w = *kernel_height_; - const int file_size = filt_h * filt_w * filt_c * filt_k; - - // cout << "ker:" << filt_k << " " << filt_c << " " << filt_h << " " << filt_w << endl; - - // 记录时间 - cudaEvent_t start, stop; - float elapsed_time = 0.0; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start, 0); - - cudnnTensorDescriptor_t in_desc; - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc)); - CUDNN_CALL(cudnnSetTensor4dDescriptor( - in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, - in_n, in_c, in_h, in_w)); - - half *in_data; - CUDA_CALL(cudaMalloc( - &in_data, in_n * in_c * in_h * in_w * sizeof(float)/2)); - - cudnnFilterDescriptor_t filt_desc; - CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc)); - CUDNN_CALL(cudnnSetFilter4dDescriptor( - filt_desc, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, - filt_k, filt_c, filt_h, filt_w)); - - half *filt_data; - CUDA_CALL(cudaMalloc( - &filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float)/2)); - - // convolution - const int pad_h = 0; - const int pad_w = 0; - const int str_h = 1; - const int str_w = 1; - const int dil_h = 1; - const int dil_w = 1; - - cudnnConvolutionDescriptor_t conv_desc; - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc)); - CUDNN_CALL(cudnnSetConvolution2dDescriptor( - conv_desc, - pad_h, pad_w, str_h, str_w, dil_h, dil_w, - CUDNN_CONVOLUTION, CUDNN_DATA_HALF)); - // cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH); - // output - int out_n; - int out_c; - int out_h; - int out_w; - - CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim( - conv_desc, in_desc, filt_desc, - &out_n, &out_c, &out_h, &out_w)); - // cout << "out:" << out_n << " " << out_c << " " << out_h << " " << out_w << endl; - - cudnnTensorDescriptor_t out_desc; - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc)); - CUDNN_CALL(cudnnSetTensor4dDescriptor( - out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, - out_n, out_c, out_h, out_w)); - - half *out_data; - CUDA_CALL(cudaMalloc( - &out_data, out_n * out_c * out_h * out_w * sizeof(float)/2)); - - // algorithm - cudnnConvolutionFwdAlgo_t 
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - - // = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; - // = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD; - // = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - // = CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING; - // = CUDNN_CONVOLUTION_FWD_ALGO_GEMM; - - // CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm( - // cudnn, - // in_desc, filt_desc, conv_desc, out_desc, - // CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo)); - - // workspace - size_t ws_size; - CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize( - cudnn, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size)); - - half *ws_data; - CUDA_CALL(cudaMalloc(&ws_data, ws_size)); - - // perform - float alpha = 1.0; - float beta = 0.0; - - /* - float input_data[25 * 3] = {0, 0, 15, 8, 0, 22, 0, 0, 23, 0, 0, 8, 0, 0, 0, 0, 0, 19, 0, 0, 10, 4, 0, 0, 22, - 0, 0, 15, 8, 0, 22, 0, 0, 23, 0, 0, 8, 0, 0, 0, 0, 0, 19, 0, 0, 10, 4, 0, 0, 22, - 0, 0, 15, 8, 0, 22, 0, 0, 23, 0, 0, 8, 0, 0, 0, 0, 0, 19, 0, 0, 10, 4, 0, 0, 22}; - float kernel_data[9 * 3] = {0, 1, 0, 1, 0, 1, 0, 1, 0, - 0, 1, 0, 1, 0, 1, 0, 1, 0, - 0, 1, 0, 1, 0, 1, 0, 1, 0}; - */ - - cudaMemcpy(in_data, matrix, in_size * sizeof(half), cudaMemcpyHostToDevice); - cudaMemcpy(filt_data, kernel, file_size * sizeof(half), cudaMemcpyHostToDevice); - - CUDNN_CALL(cudnnConvolutionForward( - cudnn, - &alpha, in_desc, in_data, filt_desc, filt_data, - conv_desc, algo, ws_data, ws_size, - &beta, out_desc, out_data)); - - // results - // std::cout << "in_data:" << std::endl; - // print(in_data, in_n, in_c, in_h, in_w); - - // std::cout << "filt_data:" << std::endl; - // print(filt_data, filt_k, filt_c, filt_h, filt_w); - - // std::cout << "out_data:" << std::endl; - // print(out_data, out_n, out_c, out_h, out_w); - - int result_size = out_n * out_c * out_h * out_w; - half *result = new half[result_size]; - cudaMemcpy(result, out_data, result_size * sizeof(half), cudaMemcpyDeviceToHost); - - // finalizing - CUDA_CALL(cudaFree(ws_data)); - CUDA_CALL(cudaFree(out_data)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc)); - CUDA_CALL(cudaFree(filt_data)); - CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc)); - CUDA_CALL(cudaFree(in_data)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc)); - CUDNN_CALL(cudnnDestroy(cudnn)); - - // stop timing and read the elapsed time - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&elapsed_time, start, stop); - // cout << elapsed_time << endl; - cudaEventDestroy(start); - cudaEventDestroy(stop); - - // write the elapsed time - // ofstream time_file("time/time_batchsize1.txt", ios::app); - time_file << elapsed_time << endl; - // time_file.close(); - // write the results to a file - // string file_name = "/home/lfa/fsy/syt/conv_pool/ECR/cudnn/output_vgg/output" + to_string(i); - // // string file_name = "/home/syt/conv_pool/conv_pool/ECR/cudnn/out_gemm/output" + to_string(i); - - // ofstream output_file(file_name); - // for (int i = 0; i < result_size; i++) - // { - // output_file << result[i] << " "; - // if ((i + 1) % out_w == 0) - // output_file << "\n"; - // } - // output_file.close(); - // cout << "res:" << result[1] << endl; - delete[] result; // result, matrix and kernel were allocated with new[], so delete[] them - delete[] matrix; - delete[] kernel; - } - - time_file.close(); - - return 0; -} \ No newline at end of file diff --git a/ECRv2/src/cudnn/time_stride/stride1_batchsize32.txt b/ECRv2/src/cudnn/time_stride/stride1_batchsize32.txt deleted file mode 100644 index e4fc1a6..0000000 --- a/ECRv2/src/cudnn/time_stride/stride1_batchsize32.txt +++ /dev/null @@ -1,16
+0,0 @@ -9.5097 -9.18128 -2.11053 -2.51846 -1.3369 -1.0785 -1.07843 -1.04432 -0.675456 -0.665248 -0.677344 -0.661664 -0.634176 -0.64832 -0.623584 -0.625184 diff --git a/ECRv2/src/cudnn/time_stride/stride2_batchsize32.txt b/ECRv2/src/cudnn/time_stride/stride2_batchsize32.txt deleted file mode 100644 index 17a9268..0000000 --- a/ECRv2/src/cudnn/time_stride/stride2_batchsize32.txt +++ /dev/null @@ -1,16 +0,0 @@ -2.41258 -2.78582 -1.14509 -1.09994 -0.693184 -0.698336 -0.68096 -0.792736 -0.718176 -0.673184 -0.625184 -0.630048 -0.633344 -0.604992 -0.608 -0.657824 diff --git a/ECRv2/src/cudnn/time_stride/stride3_batchsize32.txt b/ECRv2/src/cudnn/time_stride/stride3_batchsize32.txt deleted file mode 100644 index c22652d..0000000 --- a/ECRv2/src/cudnn/time_stride/stride3_batchsize32.txt +++ /dev/null @@ -1,16 +0,0 @@ -2.46832 -2.43386 -1.50442 -1.0936 -0.707168 -0.702176 -0.959616 -0.688096 -0.64704 -0.626816 -0.6352 -0.625856 -0.632512 -0.610688 -0.618016 -0.6104 diff --git a/ECRv2/src/cudnn/time_vgg/batchsize32.txt b/ECRv2/src/cudnn/time_vgg/batchsize32.txt deleted file mode 100644 index 1284b5e..0000000 --- a/ECRv2/src/cudnn/time_vgg/batchsize32.txt +++ /dev/null @@ -1,16 +0,0 @@ -7.31648 -7.462711 -2.263564 -2.198837 -1.348295 -1.302553 -1.356063 -1.344357 -1.105748 -1.085286 -1.096859 -1.105219 -1.060116 -1.038973 -1.057504 -1.043154 diff --git a/ECRv2/src/cudnn/time_vgg/cudnn_half.cu b/ECRv2/src/cudnn/time_vgg/cudnn_half.cu deleted file mode 100644 index e2868ba..0000000 --- a/ECRv2/src/cudnn/time_vgg/cudnn_half.cu +++ /dev/null @@ -1,426 +0,0 @@ -#include <cudnn.h> -#include <cuda_runtime.h> -#include <iostream> -#include <iomanip> -#include <fstream> -#include <string> -#include <vector> -#include <cstdlib> -#include <cstring> -#include "cuda_fp16.h" - -using namespace std; - -#define CUDA_CALL(f) \ - { \ - cudaError_t err = (f); \ - if (err != cudaSuccess) \ - { \ - std::cout \ - << " Error occurred: " << err << std::endl; \ - std::exit(1); \ - } \ - } - -#define CUDNN_CALL(f) \ - { \ - cudnnStatus_t err = (f); \ - if (err != CUDNN_STATUS_SUCCESS) \ - { \ - std::cout \ - << " Error occurred: " << err << std::endl; \ - std::exit(1); \ - } \ - } - -void print(const float *data, int n, int c, int h, int w) -{ - std::vector<float> buffer(1 << 20); - CUDA_CALL(cudaMemcpy( - buffer.data(), data, - n * c * h * w * sizeof(float), - cudaMemcpyDeviceToHost)); - int a = 0; - for (int i = 0; i < n; ++i) - { - for (int j = 0; j < c; ++j) - { - std::cout << "n=" << i << ", c=" << j << ":" << std::endl; - for (int k = 0; k < h; ++k) - { - for (int l = 0; l < w; ++l) - { - std::cout << std::setw(4) << std::right << buffer[a]; - ++a; - } - std::cout << std::endl; - } - } - } - std::cout << std::endl; -} - -float *LoadKernel(string name, int *&kernel_width, int *&kernel_height, int batch_size, int index) -{ - // ifstream kernel_shape("/home/syt/conv_pool/conv_pool/dataset/kernel/kernel_shape/" + name); - // for (int i = 0; i < 2; i++) - // { - // kernel_shape >> *kernel_width >> *kernel_height; - // } - int shape[49] = {7, - 1, 3, 1, 1, 3, 1, 1, 3, 1, - 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, - 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, - 1, 3, 1, 1, 3, 1, 1, 3, 1}; - *kernel_width = *kernel_height = shape[index]; - float *kernel = new float[*kernel_width * *kernel_height * batch_size]; - for (int i = 0; i < batch_size; i++) - { - ifstream kernel_data("dataset/resnetdataset_all/kernel/" + name); - for (int j = i * (*kernel_width * *kernel_height); j < (i + 1) * (*kernel_width * *kernel_height); j++) - kernel_data >> kernel[j]; - kernel_data.close(); - } - - return kernel; -} - -half
*LoadvggKernel(string name, int *&kernel_width, int *&kernel_height, int batch_size) -{ - float temp; - *kernel_width = *kernel_height = 3; - half *kernel = new half[*kernel_width * *kernel_height * batch_size]; - for (int i = 0; i < batch_size; i++) - { - ifstream kernel_data("dataset/vggdata/kernel/" + name +".txt"); - for (int j = i * (*kernel_width * *kernel_height); j < (i + 1) * (*kernel_width * *kernel_height); j++) - { - kernel_data >> temp; - kernel[j]=__float2half(temp); - } - kernel_data.close(); - } - - return kernel; -} - -float *LoadConvWeight(int *&fea_width, int *&fea_height, int batch_size, int index) -{ - string *name = new string[49]; - - ifstream fea_name("dataset/resnetdataset_all/feature_name.txt"); - for (int i = 0; i < 49; i++) - { - fea_name >> name[i]; - } - fea_name.close(); - - // int shape[11] = {224, 112, 56, 56, 56, 28, 28, 28, 14, 14, 14}; - // int shape = 28; - int shape[49] = {224, - 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 7, 7, 7, 7, 7, 7, 7}; - - *fea_width = *fea_height = shape[index]; - // *fea_width = *fea_height = shape; - - float *feature = new float[batch_size * *fea_width * *fea_height]; - - for (int i = 0; i < batch_size; i++) - { - ifstream fea_data("dataset/resnetdataset_all/feature/" + name[index]); - for (int j = i * (*fea_width * *fea_height); j < (i + 1) * (*fea_width * *fea_height); j++) - fea_data >> feature[j]; - fea_data.close(); - } - - return feature; -} - -half *LoadVggConvWeigth(int *&fea_width, int *&fea_height, int batch_size, int index) -{ - string *name = new string[16]; - - ifstream fea_name("dataset/vggdata/all_conv_name.txt"); - for (int i = 0; i < 16; i++) - { - fea_name >> name[i]; - } - fea_name.close(); - - // int shape[11] = {224, 112, 56, 56, 56, 28, 28, 28, 14, 14, 14}; - // int shape = 28; - int shape[16] = {224, 224, 112, 112, 56, 56, 56, 56, 28, 28, 28, 28, 14, 14, 14, 14}; - - *fea_width = *fea_height = shape[index]; - // *fea_width = *fea_height = shape; - float temp; - half *feature = new half[batch_size * *fea_width * *fea_height]; - - for (int i = 0; i < batch_size; i++) - { - ifstream fea_data("dataset/vggdata/feature/" + name[index]); - for (int j = i * (*fea_width * *fea_height); j < (i + 1) * (*fea_width * *fea_height); j++) - { - fea_data >> temp; - feature[j]=__float2half(temp); - } - fea_data.close(); - } - - return feature; -} - -float *LoadspConvWeigth(int *&fea_width, int *&fea_height, int batch_size, int index) -{ - string *name = new string[9]; - - ifstream fea_name("sparsity/dataset/sparsity_name.txt"); - for (int i = 0; i < 9; i++) - { - fea_name >> name[i]; - } - fea_name.close(); - - // int shape[11] = {224, 112, 56, 56, 56, 28, 28, 28, 14, 14, 14}; - int shape = 28; - - // *fea_width = *fea_height = shape[index]; - *fea_width = *fea_height = shape; - - float *feature = new float[batch_size * *fea_width * *fea_height]; - - for (int i = 0; i < batch_size; i++) - { - ifstream fea_data("sparsity/dataset/" + name[index]); - for (int j = i * (*fea_width * *fea_height); j < (i + 1) * (*fea_width * *fea_height); j++) - fea_data >> feature[j]; - fea_data.close(); - } - - return feature; -} - -int main(int argc, char *argv[]) -{ - int batch_size = atoi(argv[1]); - // load kernel file name - // string kername_name[49]; - // ifstream open_kernel("/home/syt/conv_pool/conv_pool/dataset/resnetdataset_all/kernel_name.txt"); - string kername_name[16]; - ifstream 
open_kernel("dataset/vggdata/kernel_name.txt"); - for (int i = 0; i < 16; i++) - { - open_kernel >> kername_name[i]; - } - open_kernel.close(); - - ofstream time_file(string("ECR/cudnn/time_vgg/batchsize") + argv[1] + string(".txt")); // string() makes operator+ concatenate the path pieces - // ofstream time_file(string("/home/syt/conv_pool/conv_pool/ECR/cudnn/time_gemm/batchsize") + argv[1] + string(".txt")); - - for (int t = 0; t < 16; t++) - { - cudnnHandle_t cudnn; - CUDNN_CALL(cudnnCreate(&cudnn)); - - // input - // int batch_size = 2; - int *fea_width_ = new int; - int *fea_height_ = new int; - half *matrix; - // matrix = LoadConvWeight(fea_width_, fea_height_, batch_size, i); - matrix = LoadVggConvWeigth(fea_width_, fea_height_, batch_size, t); - // matrix = LoadspConvWeigth(fea_width_, fea_height_, batch_size, i); - - const int in_n = batch_size; - const int in_c = 1; - const int in_h = *fea_width_; - const int in_w = *fea_height_; - const int in_size = in_h * in_w * in_c * in_n; - - // cout << "in:" << in_n << " " << in_c << " " << in_h << " " << in_w << endl; - - // filter - // int stride = 1; - int *kernel_width_ = new int; - int *kernel_height_ = new int; - half *kernel; - - // kernel = LoadKernel(kername_name[i], kernel_width_, kernel_height_, batch_size, i); - - kernel = LoadvggKernel(kername_name[t], kernel_width_, kernel_height_, batch_size); - const int filt_k = 1; - const int filt_c = 1; - const int filt_h = *kernel_width_; - const int filt_w = *kernel_height_; - const int file_size = filt_h * filt_w * filt_c * filt_k; - - // cout << "ker:" << filt_k << " " << filt_c << " " << filt_h << " " << filt_w << endl; - - // start timing - cudaEvent_t start, stop; - float elapsed_time = 0.0; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start, 0); - - cudnnTensorDescriptor_t in_desc; - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc)); - CUDNN_CALL(cudnnSetTensor4dDescriptor( - in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, - in_n, in_c, in_h, in_w)); - - half *in_data; - CUDA_CALL(cudaMalloc( - &in_data, in_n * in_c * in_h * in_w * sizeof(half))); - - cudnnFilterDescriptor_t filt_desc; - CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc)); - CUDNN_CALL(cudnnSetFilter4dDescriptor( - filt_desc, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, - filt_k, filt_c, filt_h, filt_w)); - - half *filt_data; - CUDA_CALL(cudaMalloc( - &filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(half))); - - // convolution - const int pad_h = 0; - const int pad_w = 0; - const int str_h = 1; - const int str_w = 1; - const int dil_h = 1; - const int dil_w = 1; - - cudnnConvolutionDescriptor_t conv_desc; - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc)); - CUDNN_CALL(cudnnSetConvolution2dDescriptor( - conv_desc, - pad_h, pad_w, str_h, str_w, dil_h, dil_w, - CUDNN_CONVOLUTION, CUDNN_DATA_HALF)); - // cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH); - // output - int out_n; - int out_c; - int out_h; - int out_w; - - CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim( - conv_desc, in_desc, filt_desc, - &out_n, &out_c, &out_h, &out_w)); - // cout << "out:" << out_n << " " << out_c << " " << out_h << " " << out_w << endl; - - cudnnTensorDescriptor_t out_desc; - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc)); - CUDNN_CALL(cudnnSetTensor4dDescriptor( - out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, - out_n, out_c, out_h, out_w)); - - half *out_data; - CUDA_CALL(cudaMalloc( - &out_data, out_n * out_c * out_h * out_w * sizeof(half))); - - // algorithm - cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; -
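- // sketch (not in the original file): on cuDNN 7+ the algorithm can be chosen at run time instead of hard-coded, e.g. - // cudnnConvolutionFwdAlgoPerf_t perf[8]; int returned = 0; - // CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm_v7( - // cudnn, in_desc, filt_desc, conv_desc, out_desc, 8, &returned, perf)); - // perf[0].algo then holds the algorithm cuDNN ranks fastest for these descriptors.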
- // = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; - // = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD; - // = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - // = CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING; - // = CUDNN_CONVOLUTION_FWD_ALGO_GEMM; - - // CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm( - // cudnn, - // in_desc, filt_desc, conv_desc, out_desc, - // CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo)); - - // workspace - size_t ws_size; - CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize( - cudnn, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size)); - - half *ws_data; - CUDA_CALL(cudaMalloc(&ws_data, ws_size)); - - // perform - float alpha = 1.0; - float beta = 0.0; - - /* - float input_data[25 * 3] = {0, 0, 15, 8, 0, 22, 0, 0, 23, 0, 0, 8, 0, 0, 0, 0, 0, 19, 0, 0, 10, 4, 0, 0, 22, - 0, 0, 15, 8, 0, 22, 0, 0, 23, 0, 0, 8, 0, 0, 0, 0, 0, 19, 0, 0, 10, 4, 0, 0, 22, - 0, 0, 15, 8, 0, 22, 0, 0, 23, 0, 0, 8, 0, 0, 0, 0, 0, 19, 0, 0, 10, 4, 0, 0, 22}; - float kernel_data[9 * 3] = {0, 1, 0, 1, 0, 1, 0, 1, 0, - 0, 1, 0, 1, 0, 1, 0, 1, 0, - 0, 1, 0, 1, 0, 1, 0, 1, 0}; - */ - - cudaMemcpy(in_data, matrix, in_size * sizeof(half), cudaMemcpyHostToDevice); - cudaMemcpy(filt_data, kernel, file_size * sizeof(half), cudaMemcpyHostToDevice); - - CUDNN_CALL(cudnnConvolutionForward( - cudnn, - &alpha, in_desc, in_data, filt_desc, filt_data, - conv_desc, algo, ws_data, ws_size, - &beta, out_desc, out_data)); - - // results - // std::cout << "in_data:" << std::endl; - // print(in_data, in_n, in_c, in_h, in_w); - - // std::cout << "filt_data:" << std::endl; - // print(filt_data, filt_k, filt_c, filt_h, filt_w); - - // std::cout << "out_data:" << std::endl; - // print(out_data, out_n, out_c, out_h, out_w); - - int result_size = out_n * out_c * out_h * out_w; - half *result = new half[result_size]; - cudaMemcpy(result, out_data, result_size * sizeof(half), cudaMemcpyDeviceToHost); - - // finalizing - CUDA_CALL(cudaFree(ws_data)); - CUDA_CALL(cudaFree(out_data)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc)); - CUDA_CALL(cudaFree(filt_data)); - CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc)); - CUDA_CALL(cudaFree(in_data)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc)); - CUDNN_CALL(cudnnDestroy(cudnn)); - - // stop timing and read the elapsed time - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&elapsed_time, start, stop); - // cout << elapsed_time << endl; - cudaEventDestroy(start); - cudaEventDestroy(stop); - - // write the elapsed time - // ofstream time_file("time/time_batchsize1.txt", ios::app); - time_file << elapsed_time << endl; - // time_file.close(); - // write the results to a file - // string file_name = "/home/lfa/fsy/syt/conv_pool/ECR/cudnn/output_vgg/output" + to_string(i); - // // string file_name = "/home/syt/conv_pool/conv_pool/ECR/cudnn/out_gemm/output" + to_string(i); - - // ofstream output_file(file_name); - // for (int i = 0; i < result_size; i++) - // { - // output_file << result[i] << " "; - // if ((i + 1) % out_w == 0) - // output_file << "\n"; - // } - // output_file.close(); - // cout << "res:" << result[1] << endl; - delete[] result; // result, matrix and kernel were allocated with new[], so delete[] them - delete[] matrix; - delete[] kernel; - } - - time_file.close(); - - return 0; -} \ No newline at end of file diff --git a/ECRv2/times_vgg/singleECR_times.txt b/ECRv2/times_vgg/singleECR_times.txt new file mode 100644 index 0000000..053d857 --- /dev/null +++ b/ECRv2/times_vgg/singleECR_times.txt @@ -0,0 +1,16 @@ +Measured time: 0.136416 +Measured time: 0.986848 +Measured time: 0.540448 +Measured time: 0.542496
+Measured time: 0.334304 +Measured time: 0.338272 +Measured time: 0.376416 +Measured time: 0.340064 +Measured time: 0.239392 +Measured time: 0.243744 +Measured time: 0.24096 +Measured time: 0.237184 +Measured time: 0.224032 +Measured time: 0.238336 +Measured time: 0.246112 +Measured time: 0.1368 diff --git a/dataset/resnet/kernel_name.txt b/dataset/resnet/kernel_name.txt index d913b29..fdac1e0 100644 --- a/dataset/resnet/kernel_name.txt +++ b/dataset/resnet/kernel_name.txt @@ -46,4 +46,4 @@ layer4.1.conv2.weight layer4.1.conv3.weight layer4.2.conv1.weight layer4.2.conv2.weight -layer4.2.conv3.weight \ No newline at end of file +layer4.2.conv3.weight diff --git a/dataset/vggdata/all_conv_name.txt b/dataset/vggdata/all_conv_name.txt index d917be2..94b447b 100644 --- a/dataset/vggdata/all_conv_name.txt +++ b/dataset/vggdata/all_conv_name.txt @@ -13,4 +13,4 @@ feature_26__28_28.txt feature_29__14_14.txt feature_31__14_14.txt feature_33__14_14.txt -feature_35__14_14.txt \ No newline at end of file +feature_35__14_14.txt diff --git a/dataset/vggdata/kernel_name.txt b/dataset/vggdata/kernel_name.txt index 049b660..d67e76e 100644 --- a/dataset/vggdata/kernel_name.txt +++ b/dataset/vggdata/kernel_name.txt @@ -1,16 +1,16 @@ -features.0.weight -features.2.weight -features.5.weight -features.7.weight -features.10.weight -features.12.weight -features.14.weight -features.16.weight -features.19.weight -features.21.weight -features.23.weight -features.25.weight -features.28.weight -features.30.weight -features.32.weight -features.34.weight \ No newline at end of file +features.0.weight.txt +features.2.weight.txt +features.5.weight.txt +features.7.weight.txt +features.10.weight.txt +features.12.weight.txt +features.14.weight.txt +features.16.weight.txt +features.19.weight.txt +features.21.weight.txt +features.23.weight.txt +features.25.weight.txt +features.28.weight.txt +features.30.weight.txt +features.32.weight.txt +features.34.weight.txt diff --git a/speedup/resnet/cuDNN_fast b/speedup/resnet/cuDNN_fast new file mode 100644 index 0000000..e69de29 diff --git a/speedup/resnet/cuDNN_fft b/speedup/resnet/cuDNN_fft new file mode 100644 index 0000000..b1c7874 --- /dev/null +++ b/speedup/resnet/cuDNN_fft @@ -0,0 +1,49 @@ +Measured time: 8.62298 +Measured time: 0.840128 +Measured time: 0.572864 +Measured time: 0.605728 +Measured time: 0.601792 +Measured time: 0.555808 +Measured time: 0.565696 +Measured time: 0.601664 +Measured time: 0.623968 +Measured time: 0.604608 +Measured time: 0.715872 +Measured time: 0.542624 +Measured time: 0.53648 +Measured time: 0.570528 +Measured time: 0.522624 +Measured time: 0.562688 +Measured time: 0.569184 +Measured time: 0.524544 +Measured time: 0.559968 +Measured time: 0.541312 +Measured time: 0.517248 +Measured time: 0.543808 +Measured time: 0.543776 +Measured time: 0.655936 +Measured time: 0.523232 +Measured time: 0.537856 +Measured time: 0.572896 +Measured time: 0.567968 +Measured time: 0.536096 +Measured time: 0.65584 +Measured time: 0.51088 +Measured time: 0.611776 +Measured time: 0.537056 +Measured time: 0.509952 +Measured time: 0.546304 +Measured time: 0.573504 +Measured time: 0.529952 +Measured time: 0.64896 +Measured time: 0.62672 +Measured time: 0.52288 +Measured time: 0.599488 +Measured time: 0.531488 +Measured time: 0.526304 +Measured time: 0.555904 +Measured time: 0.558848 +Measured time: 0.530336 +Measured time: 0.536096 +Measured time: 0.512736 + diff --git a/speedup/resnet/cuDNN_gemm.txt b/speedup/resnet/cuDNN_gemm.txt new file mode 100644 index 0000000..1837be2 
--- /dev/null +++ b/speedup/resnet/cuDNN_gemm.txt @@ -0,0 +1,49 @@ +Measured time: 0.591488 +Measured time: 0.556512 +Measured time: 0.600864 +Measured time: 0.521696 +Measured time: 0.561472 +Measured time: 0.51952 +Measured time: 0.541568 +Measured time: 0.52384 +Measured time: 0.648832 +Measured time: 0.839616 +Measured time: 0.542048 +Measured time: 0.542144 +Measured time: 0.600512 +Measured time: 0.51696 +Measured time: 0.522432 +Measured time: 0.60672 +Measured time: 0.517504 +Measured time: 0.505792 +Measured time: 0.51184 +Measured time: 0.520064 +Measured time: 0.520576 +Measured time: 0.506784 +Measured time: 0.560256 +Measured time: 0.511968 +Measured time: 0.571264 +Measured time: 0.746624 +Measured time: 0.548608 +Measured time: 0.654208 +Measured time: 0.645408 +Measured time: 0.521728 +Measured time: 0.524352 +Measured time: 0.52192 +Measured time: 0.512096 +Measured time: 0.522912 +Measured time: 0.577408 +Measured time: 0.545984 +Measured time: 0.704896 +Measured time: 0.526496 +Measured time: 0.519776 +Measured time: 0.574656 +Measured time: 0.503264 +Measured time: 0.540352 +Measured time: 0.534624 +Measured time: 0.52304 +Measured time: 0.536448 +Measured time: 0.602144 +Measured time: 0.52032 +Measured time: 0.52544 + diff --git a/speedup/resnet/cuDNN_imp_gemm.tx b/speedup/resnet/cuDNN_imp_gemm.tx new file mode 100644 index 0000000..6cf77bf --- /dev/null +++ b/speedup/resnet/cuDNN_imp_gemm.tx @@ -0,0 +1,49 @@ +Measured time: 0.514272 +Measured time: 0.669888 +Measured time: 0.526432 +Measured time: 0.521856 +Measured time: 0.58384 +Measured time: 0.53104 +Measured time: 0.572064 +Measured time: 0.54096 +Measured time: 0.524896 +Measured time: 0.537056 +Measured time: 0.567456 +Measured time: 0.541568 +Measured time: 0.53792 +Measured time: 0.528768 +Measured time: 0.511168 +Measured time: 0.572448 +Measured time: 0.531296 +Measured time: 0.566336 +Measured time: 0.598336 +Measured time: 0.501664 +Measured time: 0.538368 +Measured time: 0.529536 +Measured time: 0.525472 +Measured time: 0.589056 +Measured time: 0.592128 +Measured time: 0.573312 +Measured time: 0.615904 +Measured time: 0.598208 +Measured time: 0.536064 +Measured time: 0.520032 +Measured time: 0.57664 +Measured time: 0.543328 +Measured time: 0.663456 +Measured time: 0.825056 +Measured time: 0.624608 +Measured time: 0.528288 +Measured time: 0.604736 +Measured time: 0.582688 +Measured time: 0.508096 +Measured time: 0.589376 +Measured time: 0.54144 +Measured time: 0.517952 +Measured time: 0.584224 +Measured time: 0.535136 +Measured time: 0.5312 +Measured time: 0.530144 +Measured time: 0.57824 +Measured time: 0.708896 + diff --git a/speedup/resnet/singleECR.txt b/speedup/resnet/singleECR.txt new file mode 100644 index 0000000..641e000 --- /dev/null +++ b/speedup/resnet/singleECR.txt @@ -0,0 +1,49 @@ +Measured time: 0.18864 +Measured time: 0.270976 +Measured time: 0.190208 +Measured time: 0.217824 +Measured time: 0.352704 +Measured time: 0.239424 +Measured time: 0.193088 +Measured time: 0.379744 +Measured time: 0.193664 +Measured time: 0.205568 +Measured time: 0.352384 +Measured time: 0.167872 +Measured time: 0.170112 +Measured time: 0.240608 +Measured time: 0.26048 +Measured time: 0.176928 +Measured time: 0.24176 +Measured time: 0.166144 +Measured time: 0.17376 +Measured time: 0.245152 +Measured time: 0.162848 +Measured time: 0.16928 +Measured time: 0.279104 +Measured time: 0.171776 +Measured time: 0.172864 +Measured time: 0.248128 +Measured time: 0.170496 +Measured time: 0.169888 +Measured time: 0.243072 
+Measured time: 0.170592 +Measured time: 0.172448 +Measured time: 0.24064 +Measured time: 0.1696 +Measured time: 0.166176 +Measured time: 0.243072 +Measured time: 0.165248 +Measured time: 0.171904 +Measured time: 0.231264 +Measured time: 0.162944 +Measured time: 0.169472 +Measured time: 0.29504 +Measured time: 0.167232 +Measured time: 0.1672 +Measured time: 0.23264 +Measured time: 0.172672 +Measured time: 0.169216 +Measured time: 0.229152 +Measured time: 0.169536 + diff --git a/speedup/resnet/speedup.ipynb b/speedup/resnet/speedup.ipynb index 31527d9..aec7f22 100644 --- a/speedup/resnet/speedup.ipynb +++ b/speedup/resnet/speedup.ipynb @@ -38,16 +38,16 @@ "\n", "\n", "path_tc_c = \"../../ECR/cudnn/time_resnet/batchsize32.txt\"\n", - "path_tc_p = \"../../PECR/cudnn/time_resnet/batchsize32.txt\"\n", + "#path_tc_p = \"../../PECR/cudnn/time_resnet/batchsize32.txt\"\n", "\n", "time_c = pd.read_csv(path_tc_c, header=None, dtype=float)\n", - "time_p = pd.read_csv(path_tc_p, header=None, dtype=float)\n", + "#time_p = pd.read_csv(path_tc_p, header=None, dtype=float)\n", "\n", "time_cudnn = []\n", - "time_cudnn.append(time_p.iat[0, 0])\n", + "#time_cudnn.append(time_p.iat[0, 0])\n", "for i in range(0, 47):\n", " time_cudnn.append(time_c.iat[i, 0])\n", - "time_cudnn.append(time_p.iat[1, 0])\n", + "#time_cudnn.append(time_p.iat[1, 0])\n", "\n", "# print(time_cudnn)" ]