Skip to content

Commit

Permalink
OpenCL: Various tweaks, mainly to shared code
Browse files Browse the repository at this point in the history
Optimizations and clean up of shared OpenCL helper macros and shared
code and consequently all their callers. Also a few bug fixes/tweaks
to affected formats that had issues during testíng.
  • Loading branch information
magnumripper committed Feb 13, 2025
1 parent 3ebd3a5 commit 1164f14
Show file tree
Hide file tree
Showing 10 changed files with 179 additions and 188 deletions.
19 changes: 12 additions & 7 deletions run/opencl/gost12256hash_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,6 @@
*/

#include "opencl_misc.h"

#define STREEBOG256CRYPT 1
#if gpu(DEVICE_INFO)
#define STREEBOG_LOCAL_AX 1
#endif
#define STREEBOG_VECTOR 1
#define STREEBOG_UNROLL 0
#include "opencl_streebog.h"

#define SALT_LENGTH 16
Expand Down Expand Up @@ -60,6 +53,10 @@ __kernel void gost12256init(__global inbuf *in,
uint lid = get_local_id(0);

for (uint i = lid; i < 256; i += ls) {
#if STREEBOG_LOCAL_C
if (i < 12)
loc_buf->C[i].VWORD = C[i].VWORD;
#endif
for (uint j = 0; j < 8; j++)
loc_buf->Ax[j][i] = Ax[j][i];
}
Expand Down Expand Up @@ -195,6 +192,10 @@ __kernel void gost12256loop(__global inbuf *in,
uint lid = get_local_id(0);

for (uint i = lid; i < 256; i += ls) {
#if STREEBOG_LOCAL_C
if (i < 12)
loc_buf->C[i].VWORD = C[i].VWORD;
#endif
for (uint j = 0; j < 8; j++)
loc_buf->Ax[j][i] = Ax[j][i];
}
Expand Down Expand Up @@ -262,6 +263,10 @@ __kernel void gost12256final(__global inbuf *in,
uint lid = get_local_id(0);

for (uint i = lid; i < 256; i += ls) {
#if STREEBOG_LOCAL_C
if (i < 12)
loc_buf->C[i].VWORD = C[i].VWORD;
#endif
for (uint j = 0; j < 8; j++)
loc_buf->Ax[j][i] = Ax[j][i];
}
Expand Down
19 changes: 13 additions & 6 deletions run/opencl/gost12512hash_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,7 @@
*/

#include "opencl_misc.h"
#define STREEBOG512CRYPT 1
#if gpu(DEVICE_INFO)
#define STREEBOG_LOCAL_AX 1
#endif
#define STREEBOG_VECTOR 1
#define STREEBOG_UNROLL 0
#define STREEBOG512 1
#include "opencl_streebog.h"

#define SALT_LENGTH 16
Expand Down Expand Up @@ -59,6 +54,10 @@ __kernel void gost12512init(__global inbuf *in,
uint lid = get_local_id(0);

for (uint i = lid; i < 256; i += ls) {
#if STREEBOG_LOCAL_C
if (i < 12)
loc_buf->C[i].VWORD = C[i].VWORD;
#endif
for (uint j = 0; j < 8; j++)
loc_buf->Ax[j][i] = Ax[j][i];
}
Expand Down Expand Up @@ -194,6 +193,10 @@ __kernel void gost12512loop(__global inbuf *in,
uint lid = get_local_id(0);

for (uint i = lid; i < 256; i += ls) {
#if STREEBOG_LOCAL_C
if (i < 12)
loc_buf->C[i].VWORD = C[i].VWORD;
#endif
for (uint j = 0; j < 8; j++)
loc_buf->Ax[j][i] = Ax[j][i];
}
Expand Down Expand Up @@ -261,6 +264,10 @@ __kernel void gost12512final(__global inbuf *in,
uint lid = get_local_id(0);

for (uint i = lid; i < 256; i += ls) {
#if STREEBOG_LOCAL_C
if (i < 12)
loc_buf->C[i].VWORD = C[i].VWORD;
#endif
for (uint j = 0; j < 8; j++)
loc_buf->Ax[j][i] = Ax[j][i];
}
Expand Down
64 changes: 28 additions & 36 deletions run/opencl/opencl_streebog.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
/*
* Porting to OpenCL + optimizations: Copyright (c) 2022-2025 magnum, and those
* changes hereby released to the general public under the following terms:
* Redistribution and use in source and binary forms, with or without
* modification, are permitted.
*
* Copyright (c) 2013, Alexey Degtyarev <[email protected]>.
* All rights reserved.
*
Expand All @@ -22,24 +27,33 @@
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Porting to OpenCL + optimizations: Copyright (c) 2022 magnum, and those changes
* hereby released to the general public under the following terms:
* Redistribution and use in source and binary forms, with or without
* modification, are permitted.
*/

#ifndef _OPENCL_STREEBOG_H
#define _OPENCL_STREEBOG_H

#include "opencl_misc.h"

#if gpu(DEVICE_INFO)
#define STREEBOG_LOCAL_AX 1
#define STREEBOG_LOCAL_C 0
#endif

#define STREEBOG_VECTOR 1
#define STREEBOG_UNROLL 0

#if STREEBOG_LOCAL_AX
#define AX loc_buf->Ax
#else
#define AX Ax
#endif

#if STREEBOG_LOCAL_C
#define CC loc_buf->C
#else
#define CC C
#endif

#if STREEBOG_UNROLL
#define UNROLL8 _Pragma("unroll 8")
#define UNROLL16 _Pragma("unroll 16")
Expand All @@ -48,29 +62,25 @@
#define UNROLL16
#endif

#if STREEBOG512CRYPT && !defined STREEBOG512
#define STREEBOG512 1
#endif

#if STREEBOG_VECTOR
#define memcpy512(dst, src) do { \
(dst)->VWORD = (src)->VWORD; \
} while (0)
#define memcpy256 memcpy512
#else
#define memcpy512(dst, src) do { \
UNROLL16 \
for (uint i = 0; i < 16; i++) \
(dst)->DWORD[i] = (src)->DWORD[i]; \
} while (0)
#endif

#define memcpy256(dst, src) do { \
UNROLL8 \
for (uint i = 0; i < 8; i++) \
(dst)->DWORD[i] = (src)->DWORD[i]; \
} while (0)
#endif

/* QWORD on top because declarations depend on it. */
typedef union {
ulong QWORD[4];
uint DWORD[8];
Expand All @@ -91,12 +101,10 @@ typedef struct {
uint512_u N;
uint512_u Sigma;
uint bufsize;
#if !STREEBOG512CRYPT && !STREEBOG256CRYPT
uint digest_size;
#endif
} GOST34112012Context;

typedef struct {
uint512_u C[12];
ulong Ax[8][256];
} localbuf;

Expand Down Expand Up @@ -149,7 +157,7 @@ typedef struct {
} while (0)

#define ROUND(i, Ki, data) do { \
XLPS(Ki, &C[i], Ki); \
XLPS(Ki, &CC[i], Ki); \
XLPS(Ki, data, data); \
} while (0)

Expand Down Expand Up @@ -739,17 +747,10 @@ INLINE void
GOST34112012Init(GOST34112012Context *CTX, const uint digest_size)
{
CTX->buffer.VWORD = 0;
#if STREEBOG256CRYPT
CTX->h.VWORD = 0x01010101U;
#elif STREEBOG512CRYPT
#if STREEBOG512
CTX->h.VWORD = 0;
#else
CTX->digest_size = digest_size;

if (digest_size == 256)
CTX->h.VWORD = 0x01010101U;
else
CTX->h.VWORD = 0;
CTX->h.VWORD = 0x01010101U;
#endif
CTX->N.VWORD = 0;
CTX->Sigma.VWORD = 0;
Expand Down Expand Up @@ -852,7 +853,6 @@ g0(uint512_u *h, const uint512_u *m, __local localbuf *loc_buf)
for (uint i = 0; i < 11; i++)
ROUND(i, &Ki, &data);
#endif

XLPS(&Ki, &C[11], &Ki);
XOR512(&Ki, &data, &data);
/* E() done */
Expand Down Expand Up @@ -942,9 +942,7 @@ GOST34112012Update(GOST34112012Context *CTX, const uchar *data, uint len, __loca
}

static NOINLINE void
GOST34112012Final(GOST34112012Context *CTX,
void *_digest,
__local localbuf *loc_buf)
GOST34112012Final(GOST34112012Context *CTX, void *_digest, __local localbuf *loc_buf)
{
#if STREEBOG512
uint512_u *digest = _digest;
Expand All @@ -955,17 +953,11 @@ GOST34112012Final(GOST34112012Context *CTX,

CTX->bufsize = 0;

#if STREEBOG512CRYPT
#if STREEBOG512
memcpy512(digest, &(CTX->h));
#elif STREEBOG256CRYPT
#else
for (uint i = 0; i < 8; i++)
digest->DWORD[i] = CTX->h.DWORD[8 + i];
#else
if (CTX->digest_size == 256)
for (uint i = 0; i < 8; i++)
digest->DWORD[i] = CTX->h.DWORD[8 + i];
else
memcpy512(digest, &(CTX->h));
#endif
}

Expand Down
16 changes: 8 additions & 8 deletions src/opencl_argon2_fmt_plug.c
Original file line number Diff line number Diff line change
Expand Up @@ -307,10 +307,10 @@ static void done(void)

if (argon2_self == &fmt_opencl_keepass_argon2) {
if (keepass_outbuffer) {
RELEASEBUFFER(cl_keepass_in);
RELEASEBUFFER(cl_keepass_salt);
RELEASEBUFFER(cl_keepass_state);
RELEASEBUFFER(cl_keepass_out);
CLRELEASEBUFFER(cl_keepass_in);
CLRELEASEBUFFER(cl_keepass_salt);
CLRELEASEBUFFER(cl_keepass_state);
CLRELEASEBUFFER(cl_keepass_out);

MEM_FREE(keepass_inbuffer);
MEM_FREE(keepass_statebuffer);
Expand Down Expand Up @@ -644,9 +644,9 @@ static void reset(struct db_main *db)

opencl_init("$JOHN/opencl/keepass_kernel.cl", gpu_id, build_opts);

CREATEKERNEL(keepass_init, "keepass_init");
//CREATEKERNEL(keepass_argon2, "keepass_argon2");
CREATEKERNEL(keepass_final, "keepass_final");
CLCREATEKERNEL(keepass_init, "keepass_init");
//CLCREATEKERNEL(keepass_argon2, "keepass_argon2");
CLCREATEKERNEL(keepass_final, "keepass_final");
}
}

Expand Down Expand Up @@ -989,7 +989,7 @@ static void kp_set_salt(void *salt)
memcpy(&saved_salt, &keepass_salt->t_cost, sizeof(struct argon2_salt));

if (sizeof(keepass_salt_t) + keepass_salt->content_size - 1 > keepass_saltsize) {
RELEASEBUFFER(cl_keepass_salt);
CLRELEASEBUFFER(cl_keepass_salt);
keepass_saltsize = sizeof(keepass_salt_t) + keepass_salt->content_size - 1;
CLCREATEBUFFER(cl_keepass_salt, CL_RO, keepass_saltsize);
CLKERNELARG(keepass_init, 1, cl_keepass_salt);
Expand Down
43 changes: 18 additions & 25 deletions src/opencl_gost12256hash_fmt_plug.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ john_register_one(&fmt_opencl_cryptstreebog256);
#include "formats.h"
#include "options.h"
#include "opencl_common.h"
#include "opencl_helper_macros.h"

#define FORMAT_LABEL "streebog256crypt-opencl"
#define FORMAT_NAME "Astra Linux $gost12256hash$"
Expand Down Expand Up @@ -90,6 +91,7 @@ typedef struct {
} statebuf;

typedef struct {
uint64_t C[8][12];
uint64_t Ax[8][256];
} localbuf;

Expand All @@ -99,7 +101,6 @@ typedef struct {
#define ITERATIONS 5004

static inbuf *inbuffer;
static cl_int cl_error;
static cl_mem mem_in, mem_out, mem_salt, mem_state;
static cl_kernel init_kernel, final_kernel;
static int new_keys;
Expand All @@ -116,33 +117,17 @@ static int split_events[] = { 2, -1, -1 };

static void release_clobj(void);

#define CL_RO CL_MEM_READ_ONLY
#define CL_WO CL_MEM_WRITE_ONLY
#define CL_RW CL_MEM_READ_WRITE

#define CLCREATEBUFFER(_flags, _size) \
clCreateBuffer(context[gpu_id], _flags, _size, NULL, &cl_error); \
HANDLE_CLERROR(cl_error, "Error allocating GPU memory");

#define CLKERNELARG(kernel, id, arg) \
HANDLE_CLERROR(clSetKernelArg(kernel, id, sizeof(arg), &arg), \
"Error setting kernel argument");

#define CLKRNARGLOC(kernel, id, arg) \
HANDLE_CLERROR(clSetKernelArg(kernel, id, sizeof(arg), NULL), \
"Error setting kernel argument");

static void create_clobj(size_t gws, struct fmt_main *self)
{
release_clobj();

inbuffer = mem_calloc(gws, sizeof(inbuf));
crypt_out = mem_calloc(gws, sizeof(outbuf));

mem_in = CLCREATEBUFFER(CL_RO, gws * sizeof(inbuf));
mem_out = CLCREATEBUFFER(CL_WO, gws * sizeof(outbuf));
mem_state = CLCREATEBUFFER(CL_RW, gws * sizeof(statebuf));
mem_salt = CLCREATEBUFFER(CL_RO, sizeof(saltstruct));
CLCREATEBUFFER(mem_in, CL_RO, gws * sizeof(inbuf));
CLCREATEBUFFER(mem_out, CL_WO, gws * sizeof(outbuf));
CLCREATEBUFFER(mem_state, CL_RW, gws * sizeof(statebuf));
CLCREATEBUFFER(mem_salt, CL_RO, sizeof(saltstruct));

HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], mem_in, CL_TRUE, 0, gws * sizeof(inbuf), inbuffer, 0, NULL, NULL), "Copy data to gpu");
HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], mem_salt, CL_TRUE, 0, sizeof(saltstruct), &cur_salt, 0, NULL, NULL), "Salt transfer");
Expand Down Expand Up @@ -194,6 +179,9 @@ static void init(struct fmt_main *_self)
self = _self;

opencl_prepare_dev(gpu_id);

/* Work around autotune miss */
local_work_size = 256;
}

static void reset(struct db_main *db)
Expand Down Expand Up @@ -317,15 +305,20 @@ static int crypt_all(int *pcount, struct db_salt *salt)
}

// Run kernel
BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], init_kernel, 1, NULL, &gws, lws, 0, NULL, multi_profilingEvent[1]), "Run initial kernel");
CLRUNKERNEL(init_kernel, &gws, lws, multi_profilingEvent[1]);

uint loops = (ocl_autotune_running ? 1 : cur_salt.rounds / HASH_LOOPS);
WAIT_INIT(gws)
for (index = 0; index < loops; index++) {
BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1, NULL, &gws, lws, 0, NULL, multi_profilingEvent[2]), "failed in clEnqueueNDRangeKernel");
BENCH_CLERROR(clFinish(queue[gpu_id]), "Error running loop kernel");
CLRUNKERNEL(crypt_kernel, &gws, lws, multi_profilingEvent[2]);
WAIT_SLEEP
CLFINISH();
WAIT_UPDATE
opencl_process_event();
}
BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], final_kernel, 1, NULL, &gws, lws, 0, NULL, multi_profilingEvent[3]), "failed in clEnqueueNDRangeKernel");
WAIT_DONE

CLRUNKERNEL(final_kernel, &gws, lws, multi_profilingEvent[3]);

// Read the result back
BENCH_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_TRUE, 0, gws * sizeof(outbuf), crypt_out, 0, NULL, multi_profilingEvent[4]), "Copy result back");
Expand Down
Loading

0 comments on commit 1164f14

Please sign in to comment.