Skip to content

Commit d3f1ae4

Browse files
Merge branch 'CESNET:master' into wip-cmpto-j2k-cpu
2 parents 392b3b4 + 8b7db97 commit d3f1ae4

File tree

6 files changed

+131
-41
lines changed

6 files changed

+131
-41
lines changed

src/audio/playback/alsa.c

+13-15
Original file line numberDiff line numberDiff line change
@@ -448,18 +448,18 @@ set_device_buffer(snd_pcm_t *handle, playback_mode_t playback_mode,
448448
enum {
449449
REC_MIN_BUF_US = 5000,
450450
};
451-
unsigned int buf_len = 0;
451+
unsigned int buf_len_us = 0;
452452
int buf_dir = -1;
453453
const char *buff_param = get_commandline_param("alsa-playback-buffer");
454454

455455
if (get_commandline_param("low-latency-audio") != NULL &&
456456
buff_param == NULL) {
457457
// set minimal value from the configuration space
458458
CHECK_OK(snd_pcm_hw_params_set_buffer_time_first(
459-
handle, params, &buf_len, &buf_dir));
459+
handle, params, &buf_len_us, &buf_dir));
460460
MSG(INFO, "ALSA driver buffer len set to: %lf ms\n",
461-
buf_len / US_IN_1MS_DBL);
462-
if (buf_len <= REC_MIN_BUF_US) {
461+
US_TO_MS((double) buf_len_us));
462+
if (buf_len_us <= REC_MIN_BUF_US) {
463463
MSG(WARNING,
464464
"ALSA driver buffer len less than %d usec seem to "
465465
"be too loow, consider using alsa-playback-buffer "
@@ -469,22 +469,20 @@ set_device_buffer(snd_pcm_t *handle, playback_mode_t playback_mode,
469469
return;
470470
}
471471

472-
if (buff_param != NULL) {
473-
buf_len = atoi(buff_param);
474-
} else {
475-
buf_len = (playback_mode == SYNC ? BUF_LEN_DEFAULT_SYNC_MS
476-
: BUF_LEN_DEFAULT_MS) *
477-
US_IN_1MS;
478-
}
472+
buf_len_us = buff_param != NULL ? atoi(buff_param)
473+
: MS_TO_US(playback_mode == SYNC
474+
? BUF_LEN_DEFAULT_SYNC_MS
475+
: BUF_LEN_DEFAULT_MS);
479476

480477
const int rc = snd_pcm_hw_params_set_buffer_time_near(
481-
handle, params, &buf_len, &buf_dir);
478+
handle, params, &buf_len_us, &buf_dir);
482479
if (rc < 0) {
483-
MSG(WARNING, "Warning - unable to set buffer to its size: %s\n",
484-
snd_strerror(rc));
480+
MSG(WARNING,
481+
"Warning - unable to set buffer to its size %u us: %s\n",
482+
buf_len_us, snd_strerror(rc));
485483
}
486484
MSG(INFO, "ALSA driver buffer len set to: %lf ms\n",
487-
buf_len / US_IN_1MS_DBL);
485+
US_TO_MS((double) buf_len_us));
488486
}
489487

490488
ADD_TO_PARAM("alsa-play-period-size", "* alsa-play-period-size=<frames>\n"

src/cuda_wrapper/kernels.cu

+98-21
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,8 @@
4141
#include <cstdio>
4242
#include <cuda_runtime_api.h>
4343

44-
#ifdef DEBUG
45-
#define D_PRINTF printf
46-
#else
47-
#define D_PRINTF(...)
48-
#endif
44+
extern volatile int log_level;
45+
#define LOG_LEVEL_DEBUG 7
4946

5047
#define MEASURE_KERNEL_DURATION_START(stream) \
5148
cudaEvent_t t0, t1; \
@@ -57,7 +54,9 @@
5754
cudaEventSynchronize(t1); \
5855
float elapsedTime = NAN; \
5956
cudaEventElapsedTime(&elapsedTime, t0, t1); \
60-
D_PRINTF("%s elapsed time: %f ms\n", __func__, elapsedTime); \
57+
if (log_level >= LOG_LEVEL_DEBUG) { \
58+
printf("%s elapsed time: %f ms\n", __func__, elapsedTime); \
59+
} \
6160
if (elapsedTime > 10.0) { \
6261
fprintf( \
6362
stderr, \
@@ -73,9 +72,26 @@
7372
/**
7473
* modified @ref vc_copylineRG48toR12L
7574
*/
75+
template <typename load_t>
7676
__device__ static void
77-
rt48_to_r12l_compute_blk(const uint8_t *src, uint8_t *dst)
77+
rt48_to_r12l_compute_blk(const uint8_t *in, uint8_t *out)
7878
{
79+
// load the data from in to src_u32
80+
auto *in_t = (load_t *) in;
81+
uint32_t src_u32[12];
82+
for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) {
83+
static_assert(sizeof(load_t) == 2 || sizeof(load_t) == 4);
84+
if constexpr (sizeof(load_t) == 4) {
85+
src_u32[i] = in_t[i];
86+
} else {
87+
src_u32[i] = in_t[2 * i] | in_t[2 * i + 1] << 16;
88+
}
89+
}
90+
91+
uint32_t dst_u32[9];
92+
auto *dst = (uint8_t *) dst_u32;
93+
auto *src = (uint8_t *) src_u32;
94+
7995
// 0
8096
dst[0] = src[0] >> 4;
8197
dst[0] |= src[1] << 4;
@@ -191,21 +207,29 @@ rt48_to_r12l_compute_blk(const uint8_t *src, uint8_t *dst)
191207
dst[32 + 2] |= src[0] & 0xF0;
192208
dst[32 + 3] = src[1];
193209
src += 2;
210+
211+
// store the result
212+
auto *out_u32 = (uint32_t *) out;
213+
for (unsigned i = 0; i < sizeof dst_u32 / sizeof dst_u32[0]; ++i) {
214+
out_u32[i] = dst_u32[i];
215+
}
194216
}
195217

218+
template <typename load_t>
196219
__device__ static void
197220
rt48_to_r12l_compute_last_blk(uint8_t *src, uint8_t *dst, unsigned width)
198221
{
199-
uint8_t tmp[48];
222+
alignas(uint32_t) uint8_t tmp[48];
200223
for (unsigned i = 0; i < width * 6; ++i) {
201224
tmp[i] = src[i];
202225
}
203-
rt48_to_r12l_compute_blk(tmp, dst);
226+
rt48_to_r12l_compute_blk<load_t>(tmp, dst);
204227
}
205228

206229
/**
207230
* @todo fix the last block for widths not divisible by 8
208231
*/
232+
template <typename load_t>
209233
__global__ static void
210234
kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x)
211235
{
@@ -220,11 +244,11 @@ kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x)
220244

221245
// handle incomplete blocks
222246
if (position_x == size_x / 8) {
223-
rt48_to_r12l_compute_last_blk(src, dst,
224-
size_x - position_x * 8);
247+
rt48_to_r12l_compute_last_blk<load_t>(src, dst,
248+
size_x - position_x * 8);
225249
return;
226250
}
227-
rt48_to_r12l_compute_blk(src, dst);
251+
rt48_to_r12l_compute_blk<load_t>(src, dst);
228252
}
229253

230254
/**
@@ -252,9 +276,24 @@ int postprocess_rg48_to_r12l(
252276

253277
MEASURE_KERNEL_DURATION_START(stream)
254278

255-
kernel_rg48_to_r12l<<<blocks, threads_per_block, 0,
256-
(cudaStream_t) stream>>>(
257-
(uint8_t *) input_samples, (uint8_t *) output_buffer, size_x);
279+
if (size_x % 2 == 0) {
280+
kernel_rg48_to_r12l<uint32_t>
281+
<<<blocks, threads_per_block, 0, (cudaStream_t) stream>>>(
282+
(uint8_t *) input_samples, (uint8_t *) output_buffer,
283+
size_x);
284+
} else {
285+
thread_local bool warn_print;
286+
if (!warn_print) {
287+
fprintf(stderr,
288+
"%s: Odd width %d px will use slower kernel!\n",
289+
__func__, size_x);
290+
warn_print = true;
291+
}
292+
kernel_rg48_to_r12l<uint16_t>
293+
<<<blocks, threads_per_block, 0, (cudaStream_t) stream>>>(
294+
(uint8_t *) input_samples, (uint8_t *) output_buffer,
295+
size_x);
296+
}
258297

259298
MEASURE_KERNEL_DURATION_STOP(stream)
260299

@@ -266,9 +305,11 @@ int postprocess_rg48_to_r12l(
266305
// / , _/ / / / __/ / /__/___/ > > / , _// (_ / /_ _// _ |
267306
// /_/|_| /_/ /____/ /____/ /_/ /_/|_| \___/ /_/ \___/
268307

308+
template <typename store_t>
269309
__device__ static void r12l_to_rg48_compute_blk(const uint8_t *src,
270310
uint8_t *dst);
271311

312+
template <typename store_t>
272313
__global__ static void
273314
kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x)
274315
{
@@ -283,20 +324,32 @@ kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x)
283324

284325
if (position_x == size_x / 8) {
285326
// compute the last incomplete block
286-
uint8_t tmp[48];
287-
r12l_to_rg48_compute_blk(src, tmp);
327+
alignas(uint32_t) uint8_t tmp[48];
328+
r12l_to_rg48_compute_blk<store_t>(src, tmp);
288329
for (unsigned i = 0; i < (size_x - position_x * 8) * 6; ++i) {
289330
dst[i] = tmp[i];
290331
}
291332
return;
292333
}
293-
r12l_to_rg48_compute_blk(src, dst);
334+
r12l_to_rg48_compute_blk<store_t>(src, dst);
294335
}
295336

296337
/// adapted variant of @ref vc_copylineR12LtoRG48
338+
template <typename store_t>
297339
__device__ static void
298-
r12l_to_rg48_compute_blk(const uint8_t *src, uint8_t *dst)
340+
r12l_to_rg48_compute_blk(const uint8_t *in, uint8_t *out)
299341
{
342+
// load the data from in to src_u32
343+
auto *in_u32 = (uint32_t *) in;
344+
uint32_t src_u32[9];
345+
for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) {
346+
src_u32[i] = in_u32[i];
347+
}
348+
349+
uint32_t dst_u32[12];
350+
uint8_t *dst = (uint8_t *) dst_u32;
351+
uint8_t *src = (uint8_t *) src_u32;
352+
300353
// 0
301354
// R
302355
*dst++ = src[0] << 4;
@@ -377,6 +430,18 @@ r12l_to_rg48_compute_blk(const uint8_t *src, uint8_t *dst)
377430

378431
*dst++ = src[32 + 2] & 0xF0;
379432
*dst++ = src[32 + 3];
433+
434+
// store the result
435+
auto *out_t = (store_t *) out;
436+
for (unsigned i = 0; i < sizeof dst_u32 / sizeof dst_u32[0]; ++i) {
437+
static_assert(sizeof(store_t) == 2 || sizeof(store_t) == 4);
438+
if constexpr (sizeof(store_t) == 4) {
439+
out_t[i] = dst_u32[i];
440+
} else {
441+
out_t[2 * i] = dst_u32[i] & 0xFFFFU;
442+
out_t[2 * i + 1] = dst_u32[i] >> 16;
443+
}
444+
}
380445
}
381446

382447
void
@@ -387,8 +452,20 @@ preprocess_r12l_to_rg48(int width, int height, void *src, void *dst)
387452
dim3 blocks((((width + 7) / 8) + 255) / 256, height);
388453

389454
MEASURE_KERNEL_DURATION_START(0)
390-
kernel_r12l_to_rg48<<<blocks, threads_per_block>>>(
391-
(uint8_t *) src, (uint8_t *) dst, width);
455+
if (width % 2 == 0) {
456+
kernel_r12l_to_rg48<uint32_t><<<blocks, threads_per_block>>>(
457+
(uint8_t *) src, (uint8_t *) dst, width);
458+
} else {
459+
thread_local bool warn_print;
460+
if (!warn_print) {
461+
fprintf(stderr,
462+
"%s: Odd width %d px will use slower kernel!\n",
463+
__func__, width);
464+
warn_print = true;
465+
}
466+
kernel_r12l_to_rg48<uint16_t><<<blocks, threads_per_block>>>(
467+
(uint8_t *) src, (uint8_t *) dst, width);
468+
}
392469
MEASURE_KERNEL_DURATION_STOP(0)
393470
}
394471

src/export.c

+6-3
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3636
*/
3737

38-
#include <assert.h>
3938
#include <dirent.h>
4039
#include <errno.h> // for errno, EEXIST
4140
#include <limits.h>
@@ -341,7 +340,9 @@ static void process_messages(struct exporter *s) {
341340

342341
void export_audio(struct exporter *s, struct audio_frame *frame)
343342
{
344-
assert(s != NULL);
343+
if(!s){
344+
return;
345+
}
345346

346347
process_messages(s);
347348

@@ -354,7 +355,9 @@ void export_audio(struct exporter *s, struct audio_frame *frame)
354355

355356
void export_video(struct exporter *s, struct video_frame *frame)
356357
{
357-
assert(s != NULL);
358+
if(!s){
359+
return;
360+
}
358361

359362
process_messages(s);
360363

src/tv.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,6 @@ typedef long long time_ns_t;
8686
#define MS_IN_NS_DBL 1000000.0
8787
#define MS_IN_SEC 1000
8888
#define MS_IN_SEC_DBL 1000.0
89-
#define US_IN_1MS 1000
90-
#define US_IN_1MS_DBL 1000.0
9189
#define US_IN_SEC 1000000LL
9290
#define US_IN_NS 1000LL
9391
#define US_IN_SEC_DBL ((double) US_IN_SEC)
@@ -97,6 +95,10 @@ typedef long long time_ns_t;
9795
#define NS_IN_SEC_DBL ((double) NS_IN_SEC)
9896
#define NS_IN_US (NS_IN_SEC/US_IN_SEC)
9997
#define NS_IN_US_DBL ((double) NS_IN_US)
98+
#define US_TO_MS(val_us) ((val_us) / 1000)
99+
#define MS_TO_US(val_ms) ((val_ms) * 1000)
100+
#define NS_TO_MS(val_ns) ((val_ns) / 1000 / 1000)
101+
100102
static inline time_ns_t get_time_in_ns() {
101103
#ifdef HAVE_TIMESPEC_GET
102104
struct timespec ts = { 0, 0 };

src/video_compress/cmpto_j2k.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -331,9 +331,14 @@ static void parallel_conv(video_frame *dst, video_frame *src){
331331
decoder_t decoder =
332332
get_decoder_from_to(src->color_spec, dst->color_spec);
333333
assert(decoder != nullptr);
334+
time_ns_t t0 = get_time_in_ns();
334335
parallel_pix_conv((int) src->tiles[0].height, dst->tiles[0].data,
335336
dst_pitch, src->tiles[0].data, src_pitch,
336337
decoder, 0);
338+
if (log_level >= LOG_LEVEL_DEBUG) {
339+
MSG(DEBUG, "pixfmt conversion duration: %f ms\n",
340+
NS_TO_MS((double) (get_time_in_ns() - t0)));
341+
}
337342
}
338343

339344
static struct {

src/video_decompress/cmpto_j2k.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -212,9 +212,14 @@ static void rg48_to_r12l(unsigned char *dst_buffer,
212212
int dst_len = vc_get_linesize(width, R12L);
213213
decoder_t vc_copylineRG48toR12L = get_decoder_from_to(RG48, R12L);
214214

215+
time_ns_t t0 = get_time_in_ns();
215216
parallel_pix_conv((int) height, (char *) dst_buffer, dst_len,
216217
(const char *) src_buffer, src_pitch,
217218
vc_copylineRG48toR12L, 0);
219+
if (log_level >= LOG_LEVEL_DEBUG) {
220+
MSG(DEBUG, "pixfmt conversion duration: %f ms\n",
221+
NS_TO_MS((double) (get_time_in_ns() - t0)));
222+
}
218223
}
219224

220225
static void print_dropped(unsigned long long int dropped, const j2k_decompress_platform& platform) {

0 commit comments

Comments
 (0)