MSE LOSS CUDA

spring·2020년 11월 9일

// Variant 1: element-wise error plus atomic accumulation of the squared error.
//
// For every flat index in [0, N*MB) the difference between input and a
// one-hot target (1 where the column equals y[row], 0 elsewhere) is stored in
// error, and its square is atomically added to loss[row].
//
//   input, error : N*MB floats, dereferenced inside the __device__ lambda
//   loss         : indexed by row (idx / N); values are ADDED to whatever is
//                  already stored there -- this function does not clear it
//   y            : indexed by row (idx / N)
//   N            : row length used to split a flat index into row / column
//   MB           : number of rows
void CalcMSEWithLoss(float* input, float* loss, float* error, int* y, int N, int MB) {
	const thrust::counting_iterator<int> first(0);
	const thrust::counting_iterator<int> last(N * MB);

	thrust::for_each(first, last, [=] __device__(int flat) {
		const int row = flat / N;
		const int col = flat % N;
		// One-hot target: 1 when this column is the row's label, else 0.
		const float target = (col == y[row]) ? 1.0f : 0.0f;
		const float diff = input[flat] - target;
		error[flat] = diff;
		// All N elements of a row add into the same loss slot.
		atomicAdd(loss + row, diff * diff);
	});
}

// Variant 2: element-wise error, then one thrust::transform_reduce per row.
//
// The first pass stores input minus a one-hot target (1 where the column
// equals y[row]) into error. The second pass sums the squares of each row of
// error and assigns the result to loss[row] from host code, so loss is
// written on the host here while error is accessed through
// thrust::device_pointer_cast.
//
//   input, error : N*MB floats, dereferenced inside the __device__ lambda
//   loss         : MB floats, assigned from host code
//   y            : indexed by row (idx / N)
//   N            : row length
//   MB           : number of rows; one reduction call is issued per row
void CalcMSEWithLoss(float* input, float* loss, float* error, int* y, int N, int MB) {
	const thrust::counting_iterator<int> first(0);
	const thrust::counting_iterator<int> last(N * MB);

	thrust::for_each(first, last, [=] __device__(const int& flat)->void {
		const int row = flat / N;
		const int col = flat % N;
		error[flat] = input[flat] - (col == y[row]);
	});

	// Same squaring functor for every row, so build it once.
	auto square = [=] __device__(const float& v)->float {
		return v * v;
	};
	const auto errorBase = thrust::device_pointer_cast(error);

	for (int row = 0; row < MB; ++row) {
		const auto rowBegin = errorBase + (N * row);
		const auto rowEnd = errorBase + (N * (row + 1));
		loss[row] = thrust::transform_reduce(
			rowBegin, rowEnd, square, 0.f, thrust::plus<float>());
	}
}

// Variant 3: element-wise error, then per-row reductions issued through
// std::async.
//
// The first pass stores input minus a one-hot target (1 where the column
// equals y[row]) into error. Afterwards one std::launch::async task per row
// runs a thrust::transform_reduce that sums the squares of that row; the
// results are collected in row order and written to loss from host code.
//
//   input, error : N*MB floats, dereferenced inside the __device__ lambda
//   loss         : MB floats, assigned from host code
//   y            : indexed by row (idx / N)
//   N            : row length
//   MB           : number of rows, and also the number of async tasks launched
void CalcMSEWithLoss(float* input, float* loss, float* error, int* y, int N, int MB) {
	const thrust::counting_iterator<int> first(0);
	const thrust::counting_iterator<int> last(N * MB);

	thrust::for_each(first, last, [=] __device__(const int& flat)->void {
		const int row = flat / N;
		const int col = flat % N;
		error[flat] = input[flat] - (col == y[row]);
	});

	// Host-side task: sum of squared errors over a single row.
	auto sumSquaresOfRow = [=](int row)->float {
		return thrust::transform_reduce(
			thrust::device_pointer_cast(error + (N * row)),
			thrust::device_pointer_cast(error + (N * (row + 1))),
			[=] __device__(const float& v)->float {
				return v * v;
			},
			0.f, thrust::plus<float>());
	};

	std::vector<std::future<float>> pending;
	pending.reserve(MB);
	for (int row = 0; row < MB; ++row) {
		pending.push_back(std::async(std::launch::async, sumSquaresOfRow, row));
	}

	// Drain the futures in launch order so loss[row] matches its row.
	float* out = loss;
	for (auto& task : pending) {
		*out++ = task.get();
	}
}

Reference (rocThrust reductions API documentation): https://rocthrust.readthedocs.io/en/latest/api/function_group__reductions_1gaa3a63c37c3844d0e84b9d146e6f4c5b8.html

// Variant 4: element-wise error, squared into a scratch buffer, then a single
// thrust::reduce_by_key over the whole batch.
//
// Steps:
//   1. error[idx] = input[idx] - onehot, where onehot is 1 when the column
//      (idx % N) equals y[idx / N] and 0 otherwise.
//   2. d_error2[idx] = error[idx]^2 (function-static scratch buffer).
//   3. reduce_by_key sums d_error2 over runs of equal consecutive keys in
//      d_input_keys and writes one sum per run into d_loss.
//   4. The sums are copied to loss with a device-to-host cudaMemcpy.
//
//   input, error : N*MB floats, dereferenced inside the __device__ lambda
//   loss         : destination of a device-to-host copy; receives at most MB floats
//   y            : indexed by row (idx / N)
//   N, MB        : row length and number of rows; nothing is done unless both
//                  are positive
//   d_input_keys : N*MB keys read through thrust::device_pointer_cast
//                  (presumably idx / N for each element -- confirm against
//                  the caller; this function never fills it)
//
// The scratch buffers are function-static and reused across calls. They are
// reallocated whenever N*MB exceeds the current capacity; the original code
// allocated them once and then overran them on any larger later call.
// Because the buffers are shared static state, concurrent calls are not safe.
//
// If a scratch allocation fails the function returns without writing loss.
void CalcMSEWithLoss3(float* input, float* loss, float* error, int* y, int N, int MB, int* d_input_keys) {
	if (N <= 0 || MB <= 0)
		return;
	const int total = N * MB;

	thrust::for_each(
		thrust::counting_iterator<int>(0),
		thrust::counting_iterator<int>(total),
		[=] __device__(const int& idx)->void {
			error[idx] = input[idx] - ((idx % N) == y[idx / N]);
		});

	static int* d_output_keys = nullptr;
	static float* d_loss = nullptr;
	static float* d_error2 = nullptr;
	static size_t capacity = 0;	// element count the three buffers can hold

	// Drops all scratch buffers; cudaFree(nullptr) is a harmless no-op.
	auto releaseScratch = []() {
		cudaFree(d_output_keys);
		cudaFree(d_loss);
		cudaFree(d_error2);
		d_output_keys = nullptr;
		d_loss = nullptr;
		d_error2 = nullptr;
		capacity = 0;
	};

	const size_t needed = static_cast<size_t>(total);
	if (needed > capacity) {
		releaseScratch();
		// reduce_by_key can emit up to one output per input element, so the
		// key and sum outputs are sized like the input.
		const bool allocated =
			cudaMalloc((void**)&d_output_keys, needed * sizeof(int)) == cudaSuccess &&
			cudaMalloc((void**)&d_loss, needed * sizeof(float)) == cudaSuccess &&
			cudaMalloc((void**)&d_error2, needed * sizeof(float)) == cudaSuccess;
		if (!allocated) {
			releaseScratch();
			return;
		}
		capacity = needed;
	}

	thrust::transform(
		thrust::device_pointer_cast(error),
		thrust::device_pointer_cast(error + total),
		thrust::device_pointer_cast(d_error2),
		[=] __device__(float x)->float { return x * x; });

	const auto sumsBegin = thrust::device_pointer_cast(d_loss);
	auto new_end = thrust::reduce_by_key(
		thrust::device_pointer_cast(d_input_keys),
		thrust::device_pointer_cast(d_input_keys + total),
		thrust::device_pointer_cast(d_error2),
		thrust::device_pointer_cast(d_output_keys),
		sumsBegin);

	// Copy only sums that were actually produced, and never more than MB, so
	// stale or uninitialised scratch contents are not handed to the caller.
	const long long produced = static_cast<long long>(new_end.second - sumsBegin);
	const size_t toCopy = static_cast<size_t>(produced < MB ? produced : MB);
	if (toCopy > 0)
		cudaMemcpy(loss, d_loss, toCopy * sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);
}

spring

Researcher & Developer @ NAVER Corp | Designer @ HONGIK Univ.

이전 포스트

libtorch

다음 포스트

MSE LOSS CUDA

libtorch

MSE LOSS CUDA

0개의 댓글

관련 채용 정보