// Fused variant: one pass writes the per-element error and accumulates the
// squared error of each group of N elements with atomicAdd.
//
//   input, error, y, loss - all dereferenced inside the device lambda, so they
//                           must be device-accessible.
//   error[idx]            - receives input[idx] - t, where t is 1 when
//                           (idx % N) == y[idx / N] and 0 otherwise.
//   loss[idx / N]         - is ADDED to, never cleared here; the caller is
//                           responsible for its initial contents.
//   N, MB                 - the pass covers idx in [0, N*MB).
void CalcMSEWithLoss(float* input, float* loss, float* error, int* y, int N, int MB) {
    const int total = N * MB;
    thrust::counting_iterator<int> first(0);
    thrust::counting_iterator<int> last(total);

    thrust::for_each(first, last,
        [=] __device__(const int& idx) -> void {
            const int group = idx / N;   // which block of N elements
            const int slot = idx % N;    // position inside that block
            // The comparison yields 0 or 1, subtracted as a float.
            const float diff = input[idx] - (slot == y[group]);
            error[idx] = diff;
            atomicAdd(loss + group, diff * diff);
        });
}
// Two-phase variant: a device pass fills `error`, then one blocking
// thrust::transform_reduce per group sums the squared errors.
//
//   input, error, y - dereferenced inside the device lambda (device-accessible).
//   error[idx]      - receives input[idx] - t, where t is 1 when
//                     (idx % N) == y[idx / N] and 0 otherwise.
//   loss[i]         - assigned from host code with the sum of squares of
//                     error[N*i .. N*(i+1)), so it must be host-writable.
//   N, MB           - group length and number of groups.
void CalcMSEWithLoss(float* input, float* loss, float* error, int* y, int N, int MB) {
    const int total = N * MB;

    // Phase 1: per-element error.
    thrust::for_each(
        thrust::counting_iterator<int>(0),
        thrust::counting_iterator<int>(total),
        [=] __device__(const int& idx) -> void {
            const int group = idx / N;
            const int slot = idx % N;
            error[idx] = input[idx] - (slot == y[group]);
        });

    // Phase 2: one reduction per group; each call returns its result to the host.
    for (int group = 0; group < MB; ++group) {
        float* const groupBegin = error + (N * group);
        float* const groupEnd = error + (N * (group + 1));
        const float sumOfSquares = thrust::transform_reduce(
            thrust::device_pointer_cast(groupBegin),
            thrust::device_pointer_cast(groupEnd),
            [=] __device__(const float& v) -> float { return v * v; },
            0.f,
            thrust::plus<float>());
        loss[group] = sumOfSquares;
    }
}
// Host-threaded variant: a device pass fills `error`, then every group's
// sum of squares is requested from its own std::async task and collected
// in order.
//
//   input, error, y - dereferenced inside the device lambda (device-accessible).
//   error[idx]      - receives input[idx] - t, where t is 1 when
//                     (idx % N) == y[idx / N] and 0 otherwise.
//   loss[i]         - assigned from host code with the sum of squares of
//                     error[N*i .. N*(i+1)), so it must be host-writable.
//   N, MB           - group length and number of groups; MB tasks are launched
//                     with std::launch::async.
void CalcMSEWithLoss(float* input, float* loss, float* error, int* y, int N, int MB) {
    const int total = N * MB;

    // Phase 1: per-element error.
    thrust::for_each(
        thrust::counting_iterator<int>(0),
        thrust::counting_iterator<int>(total),
        [=] __device__(const int& idx) -> void {
            const int group = idx / N;
            const int slot = idx % N;
            error[idx] = input[idx] - (slot == y[group]);
        });

    // Phase 2: launch one asynchronous reduction per group.
    std::vector<std::future<float>> pending;
    pending.reserve(MB);
    for (int group = 0; group < MB; ++group) {
        pending.push_back(std::async(std::launch::async, [=]() -> float {
            float* const groupBegin = error + (N * group);
            float* const groupEnd = error + (N * (group + 1));
            return thrust::transform_reduce(
                thrust::device_pointer_cast(groupBegin),
                thrust::device_pointer_cast(groupEnd),
                [=] __device__(const float& v) -> float { return v * v; },
                0.f,
                thrust::plus<float>());
        }));
    }

    // Phase 3: wait for each task in launch order and store its result.
    int group = 0;
    for (std::future<float>& result : pending) {
        loss[group++] = result.get();
    }
}
// Reference (rocThrust reductions API): https://rocthrust.readthedocs.io/en/latest/api/function_group__reductions_1gaa3a63c37c3844d0e84b9d146e6f4c5b8.html
// Segmented variant: per-key sum of squared errors via thrust::reduce_by_key.
//
// Steps:
//   1. error[idx] = input[idx] - t for idx in [0, N*MB), where t is 1 when
//      (idx % N) == y[idx / N] and 0 otherwise.
//   2. The squared errors are written to a scratch buffer.
//   3. Runs of consecutive equal keys in d_input_keys are summed.
//   4. The first MB sums are copied to `loss` with a device-to-host cudaMemcpy.
//
// Parameters:
//   input, error - N*MB floats, dereferenced in device code; error is overwritten.
//   y            - indexed by idx / N in device code (MB ints).
//   loss         - destination of the device-to-host copy of MB floats.
//   N, MB        - group length and number of groups. Nothing is done when either
//                  is <= 0. N*MB must fit in an int (it is used as an int range).
//   d_input_keys - N*MB keys wrapped as a device pointer.
//                  NOTE(review): presumably d_input_keys[idx] == idx / N so that
//                  exactly MB sums are produced - confirm against the caller.
//
// Errors: a failed allocation or copy is reported as thrust::system_error, the same
// exception type the Thrust algorithms used here raise for CUDA failures.
// NOTE(review): thrust::system_error / thrust::cuda_category come from
// <thrust/system_error.h> and <thrust/system/cuda/error.h>; confirm the file
// includes them (directly or through the Thrust headers it already uses).
//
// The scratch buffers are function-local statics that are reused by later calls and
// regrown when a call needs more room; they are never released, and concurrent
// calls would share them.
void CalcMSEWithLoss3(float* input, float* loss, float* error, int* y, int N, int MB, int* d_input_keys) {
    if (N <= 0 || MB <= 0)
        return;  // empty problem: no range to process, nothing to copy

    const int total = N * MB;
    const size_t count = static_cast<size_t>(total);

    thrust::for_each(
        thrust::counting_iterator<int>(0),
        thrust::counting_iterator<int>(total),
        [=] __device__(const int& idx) -> void {
            error[idx] = input[idx] - ((idx % N) == y[idx / N]);
        });

    static int* d_output_keys = nullptr;
    static float* d_loss = nullptr;
    static float* d_error2 = nullptr;
    static size_t capacity = 0;  // elements currently allocated in each scratch buffer

    auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess)
            throw thrust::system_error(status, thrust::cuda_category(), what);
    };

    // Buffers sized by an earlier, smaller call must be regrown; reusing them
    // unconditionally would write past their end.
    if (capacity < count) {
        // cudaFree(nullptr) is a no-op, so this also covers the first call and a
        // previous attempt that failed part-way through.
        cudaFree(d_output_keys);
        cudaFree(d_loss);
        cudaFree(d_error2);
        d_output_keys = nullptr;
        d_loss = nullptr;
        d_error2 = nullptr;
        capacity = 0;

        check(cudaMalloc((void**)&d_output_keys, count * sizeof(int)),
              "CalcMSEWithLoss3: cudaMalloc(d_output_keys) failed");
        check(cudaMalloc((void**)&d_loss, count * sizeof(float)),
              "CalcMSEWithLoss3: cudaMalloc(d_loss) failed");
        check(cudaMalloc((void**)&d_error2, count * sizeof(float)),
              "CalcMSEWithLoss3: cudaMalloc(d_error2) failed");
        capacity = count;
    }

    // Square every error into the scratch buffer.
    thrust::transform(
        thrust::device_pointer_cast(error),
        thrust::device_pointer_cast(error + total),
        thrust::device_pointer_cast(d_error2),
        [=] __device__(float x) -> float { return x * x; });

    // Sum the squares of each run of equal keys; the returned end iterators are not needed.
    thrust::reduce_by_key(
        thrust::device_pointer_cast(d_input_keys),
        thrust::device_pointer_cast(d_input_keys + total),
        thrust::device_pointer_cast(d_error2),
        thrust::device_pointer_cast(d_output_keys),
        thrust::device_pointer_cast(d_loss));

    check(cudaMemcpy(loss, d_loss, static_cast<size_t>(MB) * sizeof(float),
                     cudaMemcpyKind::cudaMemcpyDeviceToHost),
          "CalcMSEWithLoss3: cudaMemcpy(loss) failed");
}