Skip to content

Commit

Permalink
blake2s cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
tpruvot committed Mar 13, 2016
1 parent 7ffe65c commit 5a69056
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 53 deletions.
80 changes: 35 additions & 45 deletions Algo256/blake2s.cu
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
/**
* Blake2-S 256 CUDA implementation
* @author tpruvot@github March 2016
*/
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <memory.h>

#include "miner.h"

#define NATIVE_LITTLE_ENDIAN

extern "C" {
#define NATIVE_LITTLE_ENDIAN
#include <sph/blake2s.h>
}

static __thread blake2s_state ALIGN(64) s_midstate;
static __thread blake2s_state ALIGN(64) s_ctx;

//#define GPU_MIDSTATE
#define MIDLEN 76
#define A 64

static __thread blake2s_state ALIGN(A) s_midstate;
static __thread blake2s_state ALIGN(A) s_ctx;

#include "cuda_helper.h"

#ifdef __INTELLISENSE__
Expand Down Expand Up @@ -63,7 +66,7 @@ inline void blake2s_hash_end(uint32_t *output, const uint32_t *input)
}

__host__
void blake2s_cpu_setBlock(uint32_t *penddata, blake2s_state *pstate)
void blake2s_setBlock(uint32_t *penddata, blake2s_state *pstate)
{
#ifndef GPU_MIDSTATE
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, penddata, 80, 0, cudaMemcpyHostToDevice));
Expand All @@ -73,31 +76,18 @@ void blake2s_cpu_setBlock(uint32_t *penddata, blake2s_state *pstate)
}

__device__ __forceinline__
uint32_t gpu_load32(const void *src) {
return *(uint32_t *)(src);
uint64_t gpu_load64(void *src) {
return *(uint64_t*)(src);
}

__device__ __forceinline__
void gpu_store32(void *dst, uint32_t dw) {
*(uint32_t *)(dst) = dw;
*(uint32_t*)(dst) = dw;
}

__device__ __forceinline__
void gpu_store64(void *dst, uint64_t lw) {
*(uint64_t *)(dst) = lw;
}

__device__ __forceinline__
uint64_t gpu_load48(const void *src)
{
const uint8_t *p = (const uint8_t *)src;
uint64_t w = *p++;
w |= (uint64_t)(*p++) << 8;
w |= (uint64_t)(*p++) << 16;
w |= (uint64_t)(*p++) << 24;
w |= (uint64_t)(*p++) << 32;
w |= (uint64_t)(*p++) << 40;
return w;
*(uint64_t*)(dst) = lw;
}

__device__ __forceinline__
Expand Down Expand Up @@ -202,10 +192,11 @@ void gpu_blake2s_compress(blake2s_state *S, const uint32_t *block)
for(int i = 0; i < 8; i++)
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];

#undef G
#undef ROUND
#undef G
#undef ROUND
}

/* unused but kept as reference */
__device__ __forceinline__
void gpu_blake2s_update(blake2s_state *S, const uint8_t *in, uint64_t inlen)
{
Expand Down Expand Up @@ -236,10 +227,10 @@ void gpu_blake2s_update(blake2s_state *S, const uint8_t *in, uint64_t inlen)
}

__device__ __forceinline__
void gpu_blake2s_update76(blake2s_state *S, const void *in)
void gpu_blake2s_update76(blake2s_state *S, const void *input)
{
uint64_t *b64 = (uint64_t*) S->buf;
uint64_t *i64 = (uint64_t*) in;
uint64_t *i64 = (uint64_t*) input;
#pragma unroll
for (int i=0; i < 80/8; i++)
b64[i] = i64[i];
Expand Down Expand Up @@ -295,14 +286,14 @@ void gpu_blake2s_init_param(blake2s_state *S, const blake2s_param *P)
S->buflen = 0;

#pragma unroll
for (int i = 0; i < sizeof(S->buf)/4; i++)
gpu_store32(S->buf + (4*i), 0);
for (int i = 0; i < sizeof(S->buf)/8; i++)
gpu_store64(S->buf + (8*i), 0);

uint32_t *p = (uint32_t*) P;
uint64_t *p = (uint64_t*) P;

/* IV XOR ParamBlock */
for (int i = 0; i < 8; i++)
S->h[i] ^= gpu_load32(&p[i]);
for (int i = 0; i < 4; i++)
S->h[i] ^= gpu_load64(&p[i]);
}

// Sequential blake2s initialization
Expand Down Expand Up @@ -373,7 +364,7 @@ void blake2s_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_
}

__host__
uint32_t blake2s_host_hash(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2)
uint32_t blake2s_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2)
{
uint32_t result = UINT32_MAX;

Expand Down Expand Up @@ -406,14 +397,15 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
const uint32_t first_nonce = pdata[19];

int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 26 : 22;
int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 28 : 25;
if (device_sm[dev_id] < 350) intensity = 22;

uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

if (opt_benchmark) {
ptarget[7] = swab32(0xff);
ptarget[6] = swab32(0xFFFF0);
ptarget[7] = 0;
}

if (!init[thr_id])
Expand Down Expand Up @@ -442,13 +434,14 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
blake2s_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
memcpy(&s_ctx, &s_midstate, sizeof(blake2s_state));

blake2s_cpu_setBlock(endiandata, &s_midstate);
blake2s_setBlock(endiandata, &s_midstate);

uint2 gpu_target = make_uint2(ptarget[7], ptarget[6]);
const uint32_t Htarg = ptarget[7];
const uint2 target = make_uint2(ptarget[7], ptarget[6]);

do {
uint32_t foundNonce = blake2s_host_hash(thr_id, throughput, pdata[19], gpu_target);
uint32_t foundNonce = blake2s_hash_cuda(thr_id, throughput, pdata[19], target);

*hashes_done = pdata[19] - first_nonce + throughput;

if (foundNonce != UINT32_MAX)
{
Expand All @@ -458,15 +451,14 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
le32enc(&endiandata[19], foundNonce);
blake2s_hash_end(vhashcpu, endiandata);

if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
if (vhashcpu[7] <= target.x && fulltest(vhashcpu, ptarget)) {
work_set_target_ratio(work, vhashcpu);
*hashes_done = pdata[19] + throughput - first_nonce + 1;
pdata[19] = work->nonces[0] = swab32(foundNonce);
#if NBN > 1
if (extra_results[0] != UINT32_MAX) {
le32enc(&endiandata[19], extra_results[0]);
blake2s_hash_end(vhashcpu, endiandata);
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
if (vhashcpu[7] <= target.x && fulltest(vhashcpu, ptarget)) {
work->nonces[1] = swab32(extra_results[0]);
if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio) {
work_set_target_ratio(work, vhashcpu);
Expand All @@ -490,9 +482,7 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc

} while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + pdata[19]);

*hashes_done = pdata[19] - first_nonce + 1;

MyStreamSynchronize(NULL, 0, device_map[thr_id]);
*hashes_done = pdata[19] - first_nonce;

return 0;
}
Expand Down
5 changes: 2 additions & 3 deletions README.txt
Original file line number Diff line number Diff line change
Expand Up @@ -238,9 +238,8 @@ features.

>>> RELEASE HISTORY <<<

Mar. 12th 2015 v1.7.5
Blake2S Algo
...
Mar. 13th 2016 v1.7.5
Blake2S Algo (NEVA/OXEN)

Feb. 28th 2016 v1.7.4 (1.7.3 was a preview, not official)
Decred simplified stratum (getwork over stratum)
Expand Down
5 changes: 4 additions & 1 deletion ccminer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1713,6 +1713,9 @@ static void *miner_thread(void *userdata)
// and make an unique work (extradata)
nonceptr[1] += 1;
nonceptr[2] |= thr_id;
} else if (opt_benchmark) {
// randomize work
nonceptr[-1] += 1;
}

pthread_mutex_unlock(&g_work_lock);
Expand Down Expand Up @@ -1826,11 +1829,11 @@ static void *miner_thread(void *userdata)
if (max64 < minmax) {
switch (opt_algo) {
case ALGO_BLAKECOIN:
case ALGO_BLAKE2S:
case ALGO_VANILLA:
minmax = 0x80000000U;
break;
case ALGO_BLAKE:
case ALGO_BLAKE2S:
case ALGO_BMW:
case ALGO_DECRED:
//case ALGO_WHIRLPOOLX:
Expand Down
8 changes: 4 additions & 4 deletions res/ccminer.rc
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico"
//

VS_VERSION_INFO VERSIONINFO
FILEVERSION 1,7,4,0
PRODUCTVERSION 1,7,4,0
FILEVERSION 1,7,5,0
PRODUCTVERSION 1,7,5,0
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x21L
Expand All @@ -76,10 +76,10 @@ BEGIN
BEGIN
BLOCK "040904e4"
BEGIN
VALUE "FileVersion", "1.7.4"
VALUE "FileVersion", "1.7.5"
VALUE "LegalCopyright", "Copyright (C) 2016"
VALUE "ProductName", "ccminer"
VALUE "ProductVersion", "1.7.4"
VALUE "ProductVersion", "1.7.5"
END
END
BLOCK "VarFileInfo"
Expand Down

0 comments on commit 5a69056

Please sign in to comment.