@@ -41,11 +41,8 @@
#include <cstdio>
#include <cuda_runtime_api.h>

- #ifdef DEBUG
- #define D_PRINTF printf
- #else
- #define D_PRINTF(...)
- #endif
+ extern volatile int log_level;
+ #define LOG_LEVEL_DEBUG 7

#define MEASURE_KERNEL_DURATION_START(stream) \
        cudaEvent_t t0, t1; \
@@ -57,7 +54,9 @@
        cudaEventSynchronize(t1); \
        float elapsedTime = NAN; \
        cudaEventElapsedTime(&elapsedTime, t0, t1); \
-         D_PRINTF("%s elapsed time: %f ms\n", __func__, elapsedTime); \
+         if (log_level >= LOG_LEVEL_DEBUG) { \
+                 printf("%s elapsed time: %f ms\n", __func__, elapsedTime); \
+         } \
        if (elapsedTime > 10.0) { \
                fprintf( \
                        stderr, \
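The change above swaps the compile-time D_PRINTF macro for a run-time check against the global log_level, so the timing printout can be enabled without rebuilding. A minimal standalone sketch of the same gating (log_level is stubbed locally here; in the real code it is the extern volatile int declared above, and report_elapsed is a hypothetical stand-in for the macro body):

    #include <cstdio>

    volatile int log_level = 7; // stand-in for the global from the real code
    #define LOG_LEVEL_DEBUG 7

    static void report_elapsed(float elapsed_ms)
    {
            // unlike the removed D_PRINTF, the call is always compiled in
            // and the verbosity decision happens at run time
            if (log_level >= LOG_LEVEL_DEBUG) {
                    printf("%s elapsed time: %f ms\n", __func__, elapsed_ms);
            }
    }

    int main() { report_elapsed(1.5F); }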
@@ -73,9 +72,26 @@
/**
 * modified @ref vc_copylineRG48toR12L
 */
+ template <typename load_t>
__device__ static void
- rt48_to_r12l_compute_blk(const uint8_t *src, uint8_t *dst)
+ rt48_to_r12l_compute_blk(const uint8_t *in, uint8_t *out)
{
+         // load the data from in to src_u32
+         auto *in_t = (load_t *) in;
+         uint32_t src_u32[12];
+         for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) {
+                 static_assert(sizeof(load_t) == 2 || sizeof(load_t) == 4);
+                 if constexpr (sizeof(load_t) == 4) {
+                         src_u32[i] = in_t[i];
+                 } else {
+                         src_u32[i] = in_t[2 * i] | in_t[2 * i + 1] << 16;
+                 }
+         }
+
+         uint32_t dst_u32[9];
+         auto *dst = (uint8_t *) dst_u32;
+         auto *src = (uint8_t *) src_u32;
+
        // 0
        dst[0] = src[0] >> 4;
        dst[0] |= src[1] << 4;
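The hunk above stages one 48-byte RG48 block into registers (src_u32) before unpacking. With load_t = uint32_t the loads are plain 32-bit word reads; with load_t = uint16_t each word is reassembled from two 16-bit halves, which assumes a little-endian layout (true for CUDA GPUs and common hosts). A host-side sketch checking that both paths yield identical words:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main()
    {
            alignas(uint32_t) uint8_t block[48];
            for (unsigned i = 0; i < sizeof block; ++i) {
                    block[i] = (uint8_t) (3 * i + 1); // arbitrary test pattern
            }

            uint32_t direct[12]; // the sizeof(load_t) == 4 path
            std::memcpy(direct, block, sizeof direct);

            auto *in_t = (const uint16_t *) block; // the 16-bit path
            for (unsigned i = 0; i < 12; ++i) {
                    uint32_t w = in_t[2 * i] | (uint32_t) in_t[2 * i + 1] << 16;
                    assert(w == direct[i]);
            }
    }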
@@ -191,21 +207,29 @@ rt48_to_r12l_compute_blk(const uint8_t *src, uint8_t *dst)
        dst[32 + 2] |= src[0] & 0xF0;
        dst[32 + 3] = src[1];
        src += 2;
+
+         // store the result
+         auto *out_u32 = (uint32_t *) out;
+         for (unsigned i = 0; i < sizeof dst_u32 / sizeof dst_u32[0]; ++i) {
+                 out_u32[i] = dst_u32[i];
+         }
}

+ template <typename load_t>
__device__ static void
rt48_to_r12l_compute_last_blk(uint8_t *src, uint8_t *dst, unsigned width)
{
-         uint8_t tmp[48];
+         alignas(uint32_t) uint8_t tmp[48];
        for (unsigned i = 0; i < width * 6; ++i) {
                tmp[i] = src[i];
        }
-         rt48_to_r12l_compute_blk(tmp, dst);
+         rt48_to_r12l_compute_blk<load_t>(tmp, dst);
}
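Two details in this hunk follow from the same arithmetic. The store loop writes the R12L output with unconditional 32-bit stores because one R12L block packs 8 pixels at 12 bits per component into 9 words (36 bytes), so block boundaries are always 4-byte aligned. And tmp gains alignas(uint32_t) because compute_blk now reads its input through load_t-sized words, so the staging buffer itself must be at least 4-byte aligned. A small sanity check of the block size:

    #include <cstdio>

    int main()
    {
            const unsigned bits_per_px = 3 * 12;            // R12L: 12-bit RGB
            const unsigned blk_bytes = 8 * bits_per_px / 8; // = 36
            printf("R12L block: %u bytes, %u %% 4 = %u\n",
                   blk_bytes, blk_bytes, blk_bytes % 4);
    }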

/**
 * @todo fix the last block for widths not divisible by 8
 */
+ template <typename load_t>
__global__ static void
kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x)
{
@@ -220,11 +244,11 @@ kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x)

        // handle incomplete blocks
        if (position_x == size_x / 8) {
-                 rt48_to_r12l_compute_last_blk(src, dst,
-                                               size_x - position_x * 8);
+                 rt48_to_r12l_compute_last_blk<load_t>(src, dst,
+                                                       size_x - position_x * 8);
                return;
        }
-         rt48_to_r12l_compute_blk(src, dst);
+         rt48_to_r12l_compute_blk<load_t>(src, dst);
}
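Each thread of kernel_rg48_to_r12l converts one 8-pixel block; the thread whose position_x equals size_x / 8 owns the partial tail and goes through the staging copy in compute_last_blk. A worked example of the split, for an assumed width of 1921 px:

    #include <cstdio>

    int main()
    {
            unsigned size_x = 1921;
            unsigned full_blocks = size_x / 8;           // 240
            unsigned tail_px = size_x - full_blocks * 8; // 1
            printf("%u full blocks + %u tail px\n", full_blocks, tail_px);
    }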

/**
@@ -252,9 +276,24 @@ int postprocess_rg48_to_r12l(

        MEASURE_KERNEL_DURATION_START(stream)

-         kernel_rg48_to_r12l<<<blocks, threads_per_block, 0,
-                               (cudaStream_t) stream>>>(
-             (uint8_t *) input_samples, (uint8_t *) output_buffer, size_x);
+         if (size_x % 2 == 0) {
+                 kernel_rg48_to_r12l<uint32_t>
+                     <<<blocks, threads_per_block, 0, (cudaStream_t) stream>>>(
+                         (uint8_t *) input_samples, (uint8_t *) output_buffer,
+                         size_x);
+         } else {
+                 thread_local bool warn_print;
+                 if (!warn_print) {
+                         fprintf(stderr,
+                                 "%s: Odd width %d px will use slower kernel!\n",
+                                 __func__, size_x);
+                         warn_print = true;
+                 }
+                 kernel_rg48_to_r12l<uint16_t>
+                     <<<blocks, threads_per_block, 0, (cudaStream_t) stream>>>(
+                         (uint8_t *) input_samples, (uint8_t *) output_buffer,
+                         size_x);
+         }

        MEASURE_KERNEL_DURATION_STOP(stream)

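The dispatch picks the uint32_t kernel only for even widths because RG48 stores 6 bytes per pixel: row y starts at byte offset y * size_x * 6, which is a multiple of 4 exactly when size_x is even. The thread_local warn_print flag just limits the fallback warning to one print per calling thread. A quick illustration of the stride arithmetic:

    #include <cstdio>
    #include <initializer_list>

    int main()
    {
            for (unsigned width : {1920u, 1921u}) {
                    unsigned stride = width * 6; // RG48 bytes per row
                    printf("width %u: stride %u, stride %% 4 = %u\n",
                           width, stride, stride % 4);
            }
    }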
@@ -266,9 +305,11 @@ int postprocess_rg48_to_r12l(
// / , _/ / / / __/ / /__/___/ > > / , _// (_ / /_ _// _ |
// /_/|_| /_/ /____/ /____/ /_/ /_/|_| \___/ /_/ \___/

+ template <typename store_t>
__device__ static void r12l_to_rg48_compute_blk(const uint8_t *src,
                                                uint8_t *dst);

+ template <typename store_t>
__global__ static void
kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x)
{
@@ -283,20 +324,32 @@ kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x)

        if (position_x == size_x / 8) {
                // compute the last incomplete block
-                 uint8_t tmp[48];
-                 r12l_to_rg48_compute_blk(src, tmp);
+                 alignas(uint32_t) uint8_t tmp[48];
+                 r12l_to_rg48_compute_blk<store_t>(src, tmp);
                for (unsigned i = 0; i < (size_x - position_x * 8) * 6; ++i) {
                        dst[i] = tmp[i];
                }
                return;
        }
-         r12l_to_rg48_compute_blk(src, dst);
+         r12l_to_rg48_compute_blk<store_t>(src, dst);
}

/// adapted variant of @ref vc_copylineR12LtoRG48
+ template <typename store_t>
__device__ static void
- r12l_to_rg48_compute_blk(const uint8_t *src, uint8_t *dst)
+ r12l_to_rg48_compute_blk(const uint8_t *in, uint8_t *out)
{
+         // load the data from in to src_u32
+         auto *in_u32 = (uint32_t *) in;
+         uint32_t src_u32[9];
+         for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) {
+                 src_u32[i] = in_u32[i];
+         }
+
+         uint32_t dst_u32[12];
+         uint8_t *dst = (uint8_t *) dst_u32;
+         uint8_t *src = (uint8_t *) src_u32;
+
        // 0
        // R
        *dst++ = src[0] << 4;
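Unlike the RG48 reader earlier, the load loop here is not templated: the R12L source advances in 36-byte blocks, so every block start is 4-byte aligned regardless of width and plain uint32_t loads are always safe; only the RG48 store side needs the store_t parameter. A toy check of the block-offset arithmetic:

    #include <cstdio>
    #include <initializer_list>

    int main()
    {
            // byte offset of the n-th R12L block: always a multiple of 4
            for (unsigned n : {1u, 2u, 3u, 241u}) {
                    printf("block %u starts at byte %u (%% 4 = %u)\n",
                           n, n * 36, (n * 36) % 4);
            }
    }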
@@ -377,6 +430,18 @@ r12l_to_rg48_compute_blk(const uint8_t *src, uint8_t *dst)

        *dst++ = src[32 + 2] & 0xF0;
        *dst++ = src[32 + 3];
+
+         // store the result
+         auto *out_t = (store_t *) out;
+         for (unsigned i = 0; i < sizeof dst_u32 / sizeof dst_u32[0]; ++i) {
+                 static_assert(sizeof(store_t) == 2 || sizeof(store_t) == 4);
+                 if constexpr (sizeof(store_t) == 4) {
+                         out_t[i] = dst_u32[i];
+                 } else {
+                         out_t[2 * i] = dst_u32[i] & 0xFFFFU;
+                         out_t[2 * i + 1] = dst_u32[i] >> 16;
+                 }
+         }
}
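For the 16-bit store path, each staged word is split into its low and high halves; on a little-endian target (assumed here, as on CUDA GPUs) the two 16-bit stores land exactly where a single 32-bit store would. A host-side sketch verifying the split is lossless:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main()
    {
            uint32_t word = 0xCAFEBABEu;

            uint16_t halves[2];
            halves[0] = word & 0xFFFFu; // out_t[2 * i]
            halves[1] = word >> 16;     // out_t[2 * i + 1]

            uint32_t roundtrip = 0;
            std::memcpy(&roundtrip, halves, sizeof roundtrip); // little-endian
            assert(roundtrip == word);
    }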

void
@@ -387,8 +452,20 @@ preprocess_r12l_to_rg48(int width, int height, void *src, void *dst)
        dim3 blocks((((width + 7) / 8) + 255) / 256, height);

        MEASURE_KERNEL_DURATION_START(0)
-         kernel_r12l_to_rg48<<<blocks, threads_per_block>>>(
-             (uint8_t *) src, (uint8_t *) dst, width);
+         if (width % 2 == 0) {
+                 kernel_r12l_to_rg48<uint32_t><<<blocks, threads_per_block>>>(
+                     (uint8_t *) src, (uint8_t *) dst, width);
+         } else {
+                 thread_local bool warn_print;
+                 if (!warn_print) {
+                         fprintf(stderr,
+                                 "%s: Odd width %d px will use slower kernel!\n",
+                                 __func__, width);
+                         warn_print = true;
+                 }
+                 kernel_r12l_to_rg48<uint16_t><<<blocks, threads_per_block>>>(
+                     (uint8_t *) src, (uint8_t *) dst, width);
+         }
        MEASURE_KERNEL_DURATION_STOP(0)
}

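Grid geometry note: each thread converts one 8-pixel block, so a row needs (width + 7) / 8 threads, rounded up to whole thread blocks (the "+ 255) / 256" rounding suggests threads_per_block is 256, which is an assumption here). A worked example for an assumed 3840x2160 frame:

    #include <cstdio>

    int main()
    {
            int width = 3840, height = 2160;
            int blocks_per_row = (((width + 7) / 8) + 255) / 256; // = 2
            printf("grid: %d x %d thread blocks, 256 threads each\n",
                   blocks_per_row, height);
    }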