#include <stdio.h>   /* printf, used by CUDAERR / CUBLASERR */
#include <stdlib.h>  /* EXIT_FAILURE, used by CUDAERR / CUBLASERR */

#include <cuda_runtime.h>
#include <cublas_v2.h>

/*
 * CUDAERR(x): evaluate the CUDA runtime expression `x` exactly once. If the
 * resulting status is not cudaSuccess, print its error string together with
 * the current file and line, then `return EXIT_FAILURE` from the ENCLOSING
 * function (which must therefore return something convertible from int).
 *
 * The status is captured in a local first: expanding `x` a second time inside
 * cudaGetErrorString() would re-run the failing call.
 */
#define CUDAERR(x) do { \
    const cudaError_t cudaerr_status_ = (x); \
    if (cudaerr_status_ != cudaSuccess) { \
      printf("CUDA error: %s : %s, line %d\n", cudaGetErrorString(cudaerr_status_), __FILE__, __LINE__); \
      return EXIT_FAILURE; } } while(0)

/*
 * CUBLASERR(x): evaluate the cuBLAS expression `x` exactly once. If the status
 * is not CUBLAS_STATUS_SUCCESS, print the current file and line, then
 * `return EXIT_FAILURE` from the enclosing function.
 */
#define CUBLASERR(x) do { \
    const cublasStatus_t cublaserr_status_ = (x); \
    if (cublaserr_status_ != CUBLAS_STATUS_SUCCESS) { \
      printf("CUBLAS error: %s, line %d\n", __FILE__, __LINE__); \
      return EXIT_FAILURE; } } while(0)

/* Function-like max/min. Each argument may be evaluated twice, so do not pass
 * expressions with side effects (e.g. max(i++, j)). */
#define max(a,b) (((a)>(b))?(a):(b))
#define min(a,b) (((a)<(b))?(a):(b))

/* fpe(x): non-zero when x is NaN or infinite. `x` is evaluated twice. */
#define fpe(x) (isnan(x) || isinf(x))

/**
 * Kernel declaration; the definition is not part of this header.
 * NOTE(review): the name suggests it stores `val` into the first n elements
 * of x - confirm against the kernel body.
 *
 * @param n    element count handed to the kernel
 * @param x    non-const double pointer (presumably the device buffer being filled)
 * @param val  double value passed by value
 */
__global__
void vdmemset_cuda(
  const int n,
  double *x,
  const double val);
/**
 * Kernel declaration; the definition is not part of this header.
 * NOTE(review): the name suggests it stores `val` into the first n elements
 * of x (single precision) - confirm against the kernel body.
 *
 * @param n    element count handed to the kernel
 * @param x    non-const float pointer (presumably the device buffer being filled)
 * @param val  float value passed by value
 */
__global__
void vsmemset_cuda(
  const int n,
  float *x,
  const float val);
/**
 * Kernel declaration; the definition is not part of this header.
 * NOTE(review): the name and the two-inputs/one-output shape suggest an
 * element-wise division of x and y into z - operand order and exact formula
 * must be confirmed against the kernel body.
 *
 * @param n  element count handed to the kernel
 * @param x  read-only double input; __restrict__ tells the compiler it is not
 *           aliased by the other pointer arguments
 * @param y  read-only double input, also declared __restrict__
 * @param z  non-const double pointer (presumably the result buffer)
 */
__global__
void vddiv_cuda(
  const int n,
  const double* __restrict__ x,
  const double* __restrict__ y,
  double *z);
/**
 * Kernel declaration; the definition is not part of this header.
 * NOTE(review): the name and the two-inputs/one-output shape suggest an
 * element-wise single-precision division of x and y into z - operand order
 * and exact formula must be confirmed against the kernel body.
 *
 * @param n  element count handed to the kernel
 * @param x  read-only float input; __restrict__ tells the compiler it is not
 *           aliased by the other pointer arguments
 * @param y  read-only float input, also declared __restrict__
 * @param z  non-const float pointer (presumably the result buffer)
 */
__global__
void vsdiv_cuda(
  const int n,
  const float* __restrict__ x,
  const float* __restrict__ y,
  float *z);
/**
 * Kernel declaration; the definition is not part of this header.
 * NOTE(review): the name suggests an element-wise subtraction that combines x
 * into y in place - which operand is subtracted from which cannot be told
 * from the prototype; confirm against the kernel body.
 *
 * @param n  element count handed to the kernel
 * @param x  read-only double input, declared __restrict__
 * @param y  non-const double pointer (presumably both input and result)
 */
__global__
void vdsub_cuda(
  const int n,
  const double* __restrict__ x,
  double *y);
/**
 * Kernel declaration; the definition is not part of this header.
 * NOTE(review): the name suggests an element-wise single-precision
 * subtraction that combines x into y in place - which operand is subtracted
 * from which cannot be told from the prototype; confirm against the kernel body.
 *
 * @param n  element count handed to the kernel
 * @param x  read-only float input, declared __restrict__
 * @param y  non-const float pointer (presumably both input and result)
 */
__global__
void vssub_cuda(
  const int n,
  const float* __restrict__ x,
  float *y);
/**
 * Kernel declaration; the definition is not part of this header.
 * NOTE(review): by its name this looks like the per-element term of the
 * "errorbd" computation for the special case 0 (possibly beta == 0, cf.
 * derrorbd_cuda) - the formula is not visible here; confirm against the
 * kernel body.
 *
 * @param n  element count handed to the kernel
 * @param x  read-only double input, declared __restrict__
 * @param y  non-const double pointer (presumably updated in place)
 */
__global__
void vderrorbd0_cuda(
  const int n,
  const double* __restrict__ x,
  double *y);
/**
 * Kernel declaration; the definition is not part of this header.
 * NOTE(review): by its name this looks like the single-precision per-element
 * term of the "errorbd" computation for the special case 0 (possibly
 * beta == 0, cf. serrorbd_cuda) - the formula is not visible here; confirm
 * against the kernel body.
 *
 * @param n  element count handed to the kernel
 * @param x  read-only float input, declared __restrict__
 * @param y  non-const float pointer (presumably updated in place)
 */
__global__
void vserrorbd0_cuda(
  const int n,
  const float* __restrict__ x,
  float *y);
/**
 * Kernel declaration; the definition is not part of this header.
 * NOTE(review): by its name this looks like the per-element term of the
 * "errorbd" computation for the special case 1 (possibly beta == 1, cf.
 * derrorbd_cuda) - the formula is not visible here; confirm against the
 * kernel body.
 *
 * @param n  element count handed to the kernel
 * @param x  read-only double input, declared __restrict__
 * @param y  non-const double pointer (presumably updated in place)
 */
__global__
void vderrorbd1_cuda(
  const int n,
  const double* __restrict__ x,
  double *y);
/**
 * Kernel declaration; the definition is not part of this header.
 * NOTE(review): by its name this looks like the single-precision per-element
 * term of the "errorbd" computation for the special case 1 (possibly
 * beta == 1, cf. serrorbd_cuda) - the formula is not visible here; confirm
 * against the kernel body.
 *
 * @param n  element count handed to the kernel
 * @param x  read-only float input, declared __restrict__
 * @param y  non-const float pointer (presumably updated in place)
 */
__global__
void vserrorbd1_cuda(
  const int n,
  const float* __restrict__ x,
  float *y);
/**
 * Kernel declaration; the definition is not part of this header.
 * NOTE(review): by its name this looks like the general-case per-element term
 * of the "errorbd" computation, parameterised by beta (cf. derrorbd_cuda) -
 * the formula is not visible here; confirm against the kernel body.
 *
 * @param n     element count handed to the kernel
 * @param x     read-only double input, declared __restrict__
 * @param y     non-const double pointer (presumably updated in place)
 * @param beta  double parameter passed by value
 */
__global__
void vderrorbdg_cuda(
  const int n,
  const double* __restrict__ x,
  double *y,
  const double beta);
/**
 * Kernel declaration; the definition is not part of this header.
 * NOTE(review): by its name this looks like the single-precision general-case
 * per-element term of the "errorbd" computation, parameterised by beta (cf.
 * serrorbd_cuda) - the formula is not visible here; confirm against the
 * kernel body.
 *
 * NOTE(review): beta is declared double although the data pointers are float
 * and the host-side serrorbd_cuda takes a float beta. Left as declared so the
 * prototype keeps matching its definition; worth checking whether the double
 * arithmetic is intentional.
 *
 * @param n     element count handed to the kernel
 * @param x     read-only float input, declared __restrict__
 * @param y     non-const float pointer (presumably updated in place)
 * @param beta  double parameter passed by value
 */
__global__
void vserrorbdg_cuda(
  const int n,
  const float* __restrict__ x,
  float *y,
  const double beta);
/**
 * Host-callable function declaration; the definition is not part of this header.
 * NOTE(review): presumably the launcher for vdmemset_cuda on `stream` -
 * launch configuration and synchronisation behaviour must be confirmed
 * against the definition.
 *
 * @param n       element count
 * @param x       non-const double pointer (presumably a device buffer)
 * @param val     double value passed by value
 * @param stream  CUDA stream handle passed by value
 */
void dmemset_cuda(
  const int n,
  double *x,
  const double val,
  cudaStream_t stream);
/**
 * Host-callable function declaration; the definition is not part of this header.
 * NOTE(review): presumably the launcher for vsmemset_cuda on `stream` -
 * launch configuration and synchronisation behaviour must be confirmed
 * against the definition.
 *
 * @param n       element count
 * @param x       non-const float pointer (presumably a device buffer)
 * @param val     float value passed by value
 * @param stream  CUDA stream handle passed by value
 */
void smemset_cuda(
  const int n,
  float *x,
  const float val,
  cudaStream_t stream);
/**
 * Host-callable function declaration; the definition is not part of this header.
 * NOTE(review): presumably a launcher around vddiv_cuda on `stream`. It takes
 * only two arrays whereas the kernel takes three, so how x and y map onto the
 * kernel's operands (and which one receives the result) must be confirmed
 * against the definition.
 *
 * @param n       element count
 * @param x       read-only double pointer
 * @param y       non-const double pointer (presumably receives the result)
 * @param stream  CUDA stream handle passed by value
 */
void ddiv_cuda(
  const int n,
  const double *x,
  double *y,
  cudaStream_t stream);
/**
 * Host-callable function declaration; the definition is not part of this header.
 * NOTE(review): presumably a launcher around vsdiv_cuda on `stream`. It takes
 * only two arrays whereas the kernel takes three, so how x and y map onto the
 * kernel's operands (and which one receives the result) must be confirmed
 * against the definition.
 *
 * @param n       element count
 * @param x       read-only float pointer
 * @param y       non-const float pointer (presumably receives the result)
 * @param stream  CUDA stream handle passed by value
 */
void sdiv_cuda(
  const int n,
  const float *x,
  float *y,
  cudaStream_t stream);
/**
 * Host-callable function declaration; the definition is not part of this header.
 * NOTE(review): presumably the launcher for vdsub_cuda. Unlike dmemset_cuda
 * and ddiv_cuda it takes no stream argument; which stream it uses must be
 * confirmed against the definition.
 *
 * @param n  element count
 * @param x  read-only double pointer
 * @param y  non-const double pointer (presumably both input and result)
 */
void dsub_cuda(
  const int n,
  const double *x,
  double *y);
/**
 * Host-callable function declaration; the definition is not part of this header.
 * NOTE(review): presumably the launcher for vssub_cuda. Unlike smemset_cuda
 * and sdiv_cuda it takes no stream argument; which stream it uses must be
 * confirmed against the definition.
 *
 * @param n  element count
 * @param x  read-only float pointer
 * @param y  non-const float pointer (presumably both input and result)
 */
void ssub_cuda(
  const int n,
  const float *x,
  float *y);
/**
 * Host-callable function declaration; the definition is not part of this header.
 * NOTE(review): the name and the `seed` argument suggest it fills an m-by-n
 * double matrix x with generated (random) values - generator, distribution
 * and storage layout are not visible here; confirm against the definition.
 *
 * @param m     first dimension (presumably rows)
 * @param n     second dimension (presumably columns)
 * @param seed  integer seed passed by value
 * @param x     non-const double pointer (presumably the output buffer)
 */
void dlarngenn_cuda(
  const int m,
  const int n,
  const int seed,
  double *x);
/**
 * Host-callable function declaration; the definition is not part of this header.
 * NOTE(review): the name and the `seed` argument suggest it fills an m-by-n
 * float matrix x with generated (random) values - generator, distribution
 * and storage layout are not visible here; confirm against the definition.
 *
 * @param m     first dimension (presumably rows)
 * @param n     second dimension (presumably columns)
 * @param seed  integer seed passed by value
 * @param x     non-const float pointer (presumably the output buffer)
 */
void slarngenn_cuda(
  const int m,
  const int n,
  const int seed,
  float *x);
/**
 * Host-callable function declaration; the definition is not part of this header.
 * Returns a double computed from three read-only double arrays and the
 * dimensions m, n, k.
 * NOTE(review): presumably an error/residual measure between x and the
 * product of y and z (m-by-n versus m-by-k times k-by-n) - the norm used and
 * the roles of the operands must be confirmed against the definition.
 *
 * @param m  dimension parameter
 * @param n  dimension parameter
 * @param k  dimension parameter
 * @param x  read-only double pointer
 * @param y  read-only double pointer
 * @param z  read-only double pointer
 * @return   double result (meaning defined by the implementation)
 */
double derror_cuda(
  const int m,
  const int n,
  const int k,
  const double *x,
  const double *y,
  const double *z);
/**
 * Host-callable function declaration; the definition is not part of this header.
 * Returns a float computed from three read-only float arrays and the
 * dimensions m, n, k.
 * NOTE(review): presumably an error/residual measure between x and the
 * product of y and z (m-by-n versus m-by-k times k-by-n) - the norm used and
 * the roles of the operands must be confirmed against the definition.
 *
 * @param m  dimension parameter
 * @param n  dimension parameter
 * @param k  dimension parameter
 * @param x  read-only float pointer
 * @param y  read-only float pointer
 * @param z  read-only float pointer
 * @return   float result (meaning defined by the implementation)
 */
float serror_cuda(
  const int m,
  const int n,
  const int k,
  const float *x,
  const float *y,
  const float *z);
/**
 * Host-callable function declaration; the definition is not part of this header.
 * Returns a double computed from three read-only double arrays, the
 * dimensions m, n, k and the parameter beta.
 * NOTE(review): presumably a beta-parameterised error between A and the
 * product of W and H, built on the vderrorbd0/1/g kernels - the formula and
 * matrix layouts must be confirmed against the definition.
 *
 * @param m     dimension parameter
 * @param n     dimension parameter
 * @param k     dimension parameter
 * @param A     read-only double pointer
 * @param W     read-only double pointer
 * @param H     read-only double pointer
 * @param beta  double parameter passed by value
 * @return      double result (meaning defined by the implementation)
 */
double derrorbd_cuda(
  const int m,
  const int n,
  const int k,
  const double *A,
  const double *W,
  const double *H,
  const double beta);
/**
 * Host-callable function declaration; the definition is not part of this header.
 * Returns a float computed from three read-only float arrays, the
 * dimensions m, n, k and the parameter beta.
 * NOTE(review): presumably a beta-parameterised error between A and the
 * product of W and H, built on the vserrorbd0/1/g kernels - the formula and
 * matrix layouts must be confirmed against the definition.
 *
 * @param m     dimension parameter
 * @param n     dimension parameter
 * @param k     dimension parameter
 * @param A     read-only float pointer
 * @param W     read-only float pointer
 * @param H     read-only float pointer
 * @param beta  float parameter passed by value
 * @return      float result (meaning defined by the implementation)
 */
float serrorbd_cuda(
  const int m,
  const int n,
  const int k,
  const float *A,
  const float *W,
  const float *H,
  const float beta);
void smemset_cuda(const int n, float *x, const float val, cudaStream_t stream)
void ssub_cuda(const int n, const float *x, float *y)
__global__ void vserrorbdg_cuda(const int n, const float *__restrict__ x, float *y, const double beta)
__global__ void vdmemset_cuda(const int n, double *x, const double val)
void dmemset_cuda(const int n, double *x, const double val, cudaStream_t stream)
__global__ void vdsub_cuda(const int n, const double *__restrict__ x, double *y)
float serror_cuda(const int m, const int n, const int k, const float *x, const float *y, const float *z)
__global__ void vserrorbd1_cuda(const int n, const float *__restrict__ x, float *y)
float serrorbd_cuda(const int m, const int n, const int k, const float *A, const float *W, const float *H, const float beta)
void dsub_cuda(const int n, const double *x, double *y)
__global__ void vderrorbdg_cuda(const int n, const double *__restrict__ x, double *y, const double beta)
double derror_cuda(const int m, const int n, const int k, const double *x, const double *y, const double *z)
void slarngenn_cuda(const int m, const int n, const int seed, float *x)
__global__ void vssub_cuda(const int n, const float *__restrict__ x, float *y)
__global__ void vserrorbd0_cuda(const int n, const float *__restrict__ x, float *y)
__global__ void vderrorbd0_cuda(const int n, const double *__restrict__ x, double *y)
__global__ void vsdiv_cuda(const int n, const float *__restrict__ x, const float *__restrict__ y, float *z)
double derrorbd_cuda(const int m, const int n, const int k, const double *A, const double *W, const double *H, const double beta)
__global__ void vsmemset_cuda(const int n, float *x, const float val)
void ddiv_cuda(const int n, const double *x, double *y, cudaStream_t stream)
__global__ void vddiv_cuda(const int n, const double *__restrict__ x, const double *__restrict__ y, double *z)
void sdiv_cuda(const int n, const float *x, float *y, cudaStream_t stream)
__global__ void vderrorbd1_cuda(const int n, const double *__restrict__ x, double *y)
void dlarngenn_cuda(const int m, const int n, const int seed, double *x)