@@ -41,11 +41,8 @@
#include <cstdio>
#include <cuda_runtime_api.h>

- #ifdef DEBUG
- #define D_PRINTF printf
- #else
- #define D_PRINTF(...)
- #endif
+ extern volatile int log_level;
+ #define LOG_LEVEL_DEBUG 7

#define MEASURE_KERNEL_DURATION_START(stream) \
        cudaEvent_t t0, t1; \
@@ -57,7 +54,9 @@
        cudaEventSynchronize(t1); \
        float elapsedTime = NAN; \
        cudaEventElapsedTime(&elapsedTime, t0, t1); \
-         D_PRINTF("%s elapsed time: %f ms\n", __func__, elapsedTime); \
+         if (log_level >= LOG_LEVEL_DEBUG) { \
+                 printf("%s elapsed time: %f ms\n", __func__, elapsedTime); \
+         } \
        if (elapsedTime > 10.0) { \
                fprintf( \
                        stderr, \
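The change above swaps the compile-time D_PRINTF macro for a run-time check against the global log_level, so the timing printout can be enabled without rebuilding. A minimal standalone sketch of the same gating (log_level is stubbed locally here; in the real code it is the extern volatile int declared above, and report_elapsed is a hypothetical stand-in for the macro body):

    #include <cstdio>

    volatile int log_level = 7; // stand-in for the global from the real code
    #define LOG_LEVEL_DEBUG 7

    static void report_elapsed(float elapsed_ms)
    {
            // unlike the removed D_PRINTF, the call is always compiled in
            // and the verbosity decision happens at run time
            if (log_level >= LOG_LEVEL_DEBUG) {
                    printf("%s elapsed time: %f ms\n", __func__, elapsed_ms);
            }
    }

    int main() { report_elapsed(1.5F); }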
@@ -73,9 +72,26 @@
/**
 * modified @ref vc_copylineRG48toR12L
 */
+ template <typename load_t>
__device__ static void
- rt48_to_r12l_compute_blk(const uint8_t *src, uint8_t *dst)
+ rt48_to_r12l_compute_blk(const uint8_t *in, uint8_t *out)
{
+         // load the data from in to src_u32
+         auto *in_t = (load_t *) in;
+         uint32_t src_u32[12];
+         for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) {
+                 static_assert(sizeof(load_t) == 2 || sizeof(load_t) == 4);
+                 if constexpr (sizeof(load_t) == 4) {
+                         src_u32[i] = in_t[i];
+                 } else {
+                         src_u32[i] = in_t[2 * i] | in_t[2 * i + 1] << 16;
+                 }
+         }
+
+         uint32_t dst_u32[9];
+         auto *dst = (uint8_t *) dst_u32;
+         auto *src = (uint8_t *) src_u32;
+
        // 0
        dst[0] = src[0] >> 4;
        dst[0] |= src[1] << 4;
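The hunk above stages one 48-byte RG48 block into registers (src_u32) before unpacking. With load_t = uint32_t the loads are plain 32-bit word reads; with load_t = uint16_t each word is reassembled from two 16-bit halves, which assumes a little-endian layout (true for CUDA GPUs and common hosts). A host-side sketch checking that both paths yield identical words:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main()
    {
            alignas(uint32_t) uint8_t block[48];
            for (unsigned i = 0; i < sizeof block; ++i) {
                    block[i] = (uint8_t) (3 * i + 1); // arbitrary test pattern
            }

            uint32_t direct[12]; // the sizeof(load_t) == 4 path
            std::memcpy(direct, block, sizeof direct);

            auto *in_t = (const uint16_t *) block; // the 16-bit path
            for (unsigned i = 0; i < 12; ++i) {
                    uint32_t w = in_t[2 * i] | (uint32_t) in_t[2 * i + 1] << 16;
                    assert(w == direct[i]);
            }
    }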
@@ -191,21 +207,29 @@ rt48_to_r12l_compute_blk(const uint8_t *src, uint8_t *dst)
        dst[32 + 2] |= src[0] & 0xF0;
        dst[32 + 3] = src[1];
        src += 2;
+
+         // store the result
+         auto *out_u32 = (uint32_t *) out;
+         for (unsigned i = 0; i < sizeof dst_u32 / sizeof dst_u32[0]; ++i) {
+                 out_u32[i] = dst_u32[i];
+         }
}

+ template <typename load_t>
__device__ static void
rt48_to_r12l_compute_last_blk(uint8_t *src, uint8_t *dst, unsigned width)
{
-         uint8_t tmp[48];
+         alignas(uint32_t) uint8_t tmp[48];
        for (unsigned i = 0; i < width * 6; ++i) {
                tmp[i] = src[i];
        }
-         rt48_to_r12l_compute_blk(tmp, dst);
+         rt48_to_r12l_compute_blk<load_t>(tmp, dst);
}
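Two details in this hunk follow from the same arithmetic. The store loop writes the R12L output with unconditional 32-bit stores because one R12L block packs 8 pixels at 12 bits per component into 9 words (36 bytes), so block boundaries are always 4-byte aligned. And tmp gains alignas(uint32_t) because compute_blk now reads its input through load_t-sized words, so the staging buffer itself must be at least 4-byte aligned. A small sanity check of the block size:

    #include <cstdio>

    int main()
    {
            const unsigned bits_per_px = 3 * 12;            // R12L: 12-bit RGB
            const unsigned blk_bytes = 8 * bits_per_px / 8; // = 36
            printf("R12L block: %u bytes, %u %% 4 = %u\n",
                   blk_bytes, blk_bytes, blk_bytes % 4);
    }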

/**
 * @todo fix the last block for widths not divisible by 8
 */
+ template <typename load_t>
__global__ static void
kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x)
{
@@ -220,11 +244,11 @@ kernel_rg48_to_r12l(uint8_t *in, uint8_t *out, unsigned size_x)

        // handle incomplete blocks
        if (position_x == size_x / 8) {
-                 rt48_to_r12l_compute_last_blk(src, dst,
-                                               size_x - position_x * 8);
+                 rt48_to_r12l_compute_last_blk<load_t>(src, dst,
+                                                       size_x - position_x * 8);
                return;
        }
-         rt48_to_r12l_compute_blk(src, dst);
+         rt48_to_r12l_compute_blk<load_t>(src, dst);
}
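Each thread of kernel_rg48_to_r12l converts one 8-pixel block; the thread whose position_x equals size_x / 8 owns the partial tail and goes through the staging copy in compute_last_blk. A worked example of the split, for an assumed width of 1921 px:

    #include <cstdio>

    int main()
    {
            unsigned size_x = 1921;
            unsigned full_blocks = size_x / 8;           // 240
            unsigned tail_px = size_x - full_blocks * 8; // 1
            printf("%u full blocks + %u tail px\n", full_blocks, tail_px);
    }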

/**
@@ -252,9 +276,24 @@ int postprocess_rg48_to_r12l(

        MEASURE_KERNEL_DURATION_START(stream)

-         kernel_rg48_to_r12l<<<blocks, threads_per_block, 0,
-                               (cudaStream_t) stream>>>(
-             (uint8_t *) input_samples, (uint8_t *) output_buffer, size_x);
+         if (size_x % 2 == 0) {
+                 kernel_rg48_to_r12l<uint32_t>
+                     <<<blocks, threads_per_block, 0, (cudaStream_t) stream>>>(
+                         (uint8_t *) input_samples, (uint8_t *) output_buffer,
+                         size_x);
+         } else {
+                 thread_local bool warn_print;
+                 if (!warn_print) {
+                         fprintf(stderr,
+                                 "%s: Odd width %d px will use slower kernel!\n",
+                                 __func__, size_x);
+                         warn_print = true;
+                 }
+                 kernel_rg48_to_r12l<uint16_t>
+                     <<<blocks, threads_per_block, 0, (cudaStream_t) stream>>>(
+                         (uint8_t *) input_samples, (uint8_t *) output_buffer,
+                         size_x);
+         }

        MEASURE_KERNEL_DURATION_STOP(stream)

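The dispatch picks the uint32_t kernel only for even widths because RG48 stores 6 bytes per pixel: row y starts at byte offset y * size_x * 6, which is a multiple of 4 exactly when size_x is even. The thread_local warn_print flag just limits the fallback warning to one print per calling thread. A quick illustration of the stride arithmetic:

    #include <cstdio>
    #include <initializer_list>

    int main()
    {
            for (unsigned width : {1920u, 1921u}) {
                    unsigned stride = width * 6; // RG48 bytes per row
                    printf("width %u: stride %u, stride %% 4 = %u\n",
                           width, stride, stride % 4);
            }
    }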
@@ -266,9 +305,11 @@ int postprocess_rg48_to_r12l(
// / , _/ / / / __/ / /__/___/ > > / , _// (_ / /_ _// _ |
// /_/|_| /_/ /____/ /____/ /_/ /_/|_| \___/ /_/ \___/

+ template <typename store_t>
__device__ static void r12l_to_rg48_compute_blk(const uint8_t *src,
                                                uint8_t *dst);

+ template <typename store_t>
__global__ static void
kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x)
{
@@ -283,20 +324,32 @@ kernel_r12l_to_rg48(uint8_t *in, uint8_t *out, unsigned size_x)

        if (position_x == size_x / 8) {
                // compute the last incomplete block
-                 uint8_t tmp[48];
-                 r12l_to_rg48_compute_blk(src, tmp);
+                 alignas(uint32_t) uint8_t tmp[48];
+                 r12l_to_rg48_compute_blk<store_t>(src, tmp);
                for (unsigned i = 0; i < (size_x - position_x * 8) * 6; ++i) {
                        dst[i] = tmp[i];
                }
                return;
        }
-         r12l_to_rg48_compute_blk(src, dst);
+         r12l_to_rg48_compute_blk<store_t>(src, dst);
}

/// adapted variant of @ref vc_copylineR12LtoRG48
+ template <typename store_t>
__device__ static void
- r12l_to_rg48_compute_blk(const uint8_t *src, uint8_t *dst)
+ r12l_to_rg48_compute_blk(const uint8_t *in, uint8_t *out)
{
+         // load the data from in to src_u32
+         auto *in_u32 = (uint32_t *) in;
+         uint32_t src_u32[9];
+         for (unsigned i = 0; i < sizeof src_u32 / sizeof src_u32[0]; ++i) {
+                 src_u32[i] = in_u32[i];
+         }
+
+         uint32_t dst_u32[12];
+         uint8_t *dst = (uint8_t *) dst_u32;
+         uint8_t *src = (uint8_t *) src_u32;
+
        // 0
        // R
        *dst++ = src[0] << 4;
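Unlike the RG48 reader earlier, the load loop here is not templated: the R12L source advances in 36-byte blocks, so every block start is 4-byte aligned regardless of width and plain uint32_t loads are always safe; only the RG48 store side needs the store_t parameter. A toy check of the block-offset arithmetic:

    #include <cstdio>
    #include <initializer_list>

    int main()
    {
            // byte offset of the n-th R12L block: always a multiple of 4
            for (unsigned n : {1u, 2u, 3u, 241u}) {
                    printf("block %u starts at byte %u (%% 4 = %u)\n",
                           n, n * 36, (n * 36) % 4);
            }
    }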
@@ -377,6 +430,18 @@ r12l_to_rg48_compute_blk(const uint8_t *src, uint8_t *dst)

        *dst++ = src[32 + 2] & 0xF0;
        *dst++ = src[32 + 3];
+
+         // store the result
+         auto *out_t = (store_t *) out;
+         for (unsigned i = 0; i < sizeof dst_u32 / sizeof dst_u32[0]; ++i) {
+                 static_assert(sizeof(store_t) == 2 || sizeof(store_t) == 4);
+                 if constexpr (sizeof(store_t) == 4) {
+                         out_t[i] = dst_u32[i];
+                 } else {
+                         out_t[2 * i] = dst_u32[i] & 0xFFFFU;
+                         out_t[2 * i + 1] = dst_u32[i] >> 16;
+                 }
+         }
}
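For the 16-bit store path, each staged word is split into its low and high halves; on a little-endian target (assumed here, as on CUDA GPUs) the two 16-bit stores land exactly where a single 32-bit store would. A host-side sketch verifying the split is lossless:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main()
    {
            uint32_t word = 0xCAFEBABEu;

            uint16_t halves[2];
            halves[0] = word & 0xFFFFu; // out_t[2 * i]
            halves[1] = word >> 16;     // out_t[2 * i + 1]

            uint32_t roundtrip = 0;
            std::memcpy(&roundtrip, halves, sizeof roundtrip); // little-endian
            assert(roundtrip == word);
    }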

void
@@ -387,8 +452,20 @@ preprocess_r12l_to_rg48(int width, int height, void *src, void *dst)
        dim3 blocks((((width + 7) / 8) + 255) / 256, height);

        MEASURE_KERNEL_DURATION_START(0)
-         kernel_r12l_to_rg48<<<blocks, threads_per_block>>>(
-             (uint8_t *) src, (uint8_t *) dst, width);
+         if (width % 2 == 0) {
+                 kernel_r12l_to_rg48<uint32_t><<<blocks, threads_per_block>>>(
+                     (uint8_t *) src, (uint8_t *) dst, width);
+         } else {
+                 thread_local bool warn_print;
+                 if (!warn_print) {
+                         fprintf(stderr,
+                                 "%s: Odd width %d px will use slower kernel!\n",
+                                 __func__, width);
+                         warn_print = true;
+                 }
+                 kernel_r12l_to_rg48<uint16_t><<<blocks, threads_per_block>>>(
+                     (uint8_t *) src, (uint8_t *) dst, width);
+         }
        MEASURE_KERNEL_DURATION_STOP(0)
}

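Grid geometry note: each thread converts one 8-pixel block, so a row needs (width + 7) / 8 threads, rounded up to whole thread blocks (the "+ 255) / 256" rounding suggests threads_per_block is 256, which is an assumption here). A worked example for an assumed 3840x2160 frame:

    #include <cstdio>

    int main()
    {
            int width = 3840, height = 2160;
            int blocks_per_row = (((width + 7) / 8) + 255) / 256; // = 2
            printf("grid: %d x %d thread blocks, 256 threads each\n",
                   blocks_per_row, height);
    }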