lavfi/hqdn3d: add slice thread optimization
author Jun Zhao <barryjzhao@tencent.com>
Wed, 2 Oct 2019 02:31:13 +0000 (10:31 +0800)
committer Jun Zhao <barryjzhao@tencent.com>
Thu, 10 Oct 2019 01:33:09 +0000 (09:33 +0800)
Enable one thread per plane. The following command was used to test with a
1080P video (YUV420P format):

ffmpeg -i 1080p.mp4 -an -vf hqdn3d -f null /dev/null

This optimization improves performance by about 30% in the 1080P YUV420P
case (from 110 fps to 143 fps); it also passes the framemd5 check and FATE.

Reviewed-by: Paul B Mahol <onemda@gmail.com>
Reviewed-by: Moritz Barsnick <barsnick@gmx.net>
Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
libavfilter/vf_hqdn3d.c
libavfilter/vf_hqdn3d.h

index d6c14bb..e50d30e 100644 (file)
@@ -223,7 +223,9 @@ static av_cold void uninit(AVFilterContext *ctx)
     av_freep(&s->coefs[1]);
     av_freep(&s->coefs[2]);
     av_freep(&s->coefs[3]);
-    av_freep(&s->line);
+    av_freep(&s->line[0]);
+    av_freep(&s->line[1]);
+    av_freep(&s->line[2]);
     av_freep(&s->frame_prev[0]);
     av_freep(&s->frame_prev[1]);
     av_freep(&s->frame_prev[2]);
@@ -271,9 +273,11 @@ static int config_input(AVFilterLink *inlink)
     s->vsub  = desc->log2_chroma_h;
     s->depth = desc->comp[0].depth;
 
-    s->line = av_malloc_array(inlink->w, sizeof(*s->line));
-    if (!s->line)
-        return AVERROR(ENOMEM);
+    for (i = 0; i < 3; i++) {
+        s->line[i] = av_malloc_array(inlink->w, sizeof(*s->line[i]));
+        if (!s->line[i])
+            return AVERROR(ENOMEM);
+    }
 
     for (i = 0; i < 4; i++) {
         s->coefs[i] = precalc_coefs(s->strength[i], s->depth);
@@ -287,14 +291,38 @@ static int config_input(AVFilterLink *inlink)
     return 0;
 }
 
+typedef struct ThreadData { /* arguments that filter_frame() hands to do_denoise() via execute() */
+    AVFrame *in, *out;      /* frame to read from and frame to write to */
+    int direct;             /* copy of filter_frame()'s `direct` flag */
+} ThreadData;
+
+static int do_denoise(AVFilterContext *ctx, void *data, int job_nr, int n_jobs) /* threaded job: denoise the plane whose index is job_nr; n_jobs is not read */
+{
+    HQDN3DContext *s = ctx->priv;
+    const ThreadData *td = data; /* data is the ThreadData filled in by filter_frame() */
+    AVFrame *out = td->out;
+    AVFrame *in = td->in;
+    int direct = td->direct; /* NOTE(review): assigned but never used in this function */
+
+    denoise(s, in->data[job_nr], out->data[job_nr],
+                s->line[job_nr], &s->frame_prev[job_nr], /* each job indexes its own line buffer and previous-frame buffer */
+                AV_CEIL_RSHIFT(in->width,  (!!job_nr * s->hsub)), /* shift is 0 for job 0, s->hsub for any other job */
+                AV_CEIL_RSHIFT(in->height, (!!job_nr * s->vsub)), /* shift is 0 for job 0, s->vsub for any other job */
+                in->linesize[job_nr], out->linesize[job_nr],
+                s->coefs[job_nr ? CHROMA_SPATIAL : LUMA_SPATIAL], /* job 0 takes the LUMA_* tables, other jobs the CHROMA_* tables */
+                s->coefs[job_nr ? CHROMA_TMP     : LUMA_TMP]);
+
+    return 0; /* always reports success */
+}
+
 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 {
     AVFilterContext *ctx  = inlink->dst;
-    HQDN3DContext *s = ctx->priv;
     AVFilterLink *outlink = ctx->outputs[0];
 
     AVFrame *out;
-    int c, direct = av_frame_is_writable(in) && !ctx->is_disabled;
+    int direct = av_frame_is_writable(in) && !ctx->is_disabled;
+    ThreadData td;
 
     if (direct) {
         out = in;
@@ -308,15 +336,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
         av_frame_copy_props(out, in);
     }
 
-    for (c = 0; c < 3; c++) {
-        denoise(s, in->data[c], out->data[c],
-                s->line, &s->frame_prev[c],
-                AV_CEIL_RSHIFT(in->width,  (!!c * s->hsub)),
-                AV_CEIL_RSHIFT(in->height, (!!c * s->vsub)),
-                in->linesize[c], out->linesize[c],
-                s->coefs[c ? CHROMA_SPATIAL : LUMA_SPATIAL],
-                s->coefs[c ? CHROMA_TMP     : LUMA_TMP]);
-    }
+    td.in = in;
+    td.out = out;
+    td.direct = direct;
+    /* one thread per plane */
+    ctx->internal->execute(ctx, do_denoise, &td, NULL, 3);
 
     if (ctx->is_disabled) {
         av_frame_free(&out);
@@ -370,5 +394,5 @@ AVFilter ff_vf_hqdn3d = {
     .query_formats = query_formats,
     .inputs        = avfilter_vf_hqdn3d_inputs,
     .outputs       = avfilter_vf_hqdn3d_outputs,
-    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
+    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL | AVFILTER_FLAG_SLICE_THREADS,
 };
index 03a79a1..3279bbc 100644 (file)
@@ -31,7 +31,7 @@
 typedef struct HQDN3DContext {
     const AVClass *class;
     int16_t *coefs[4];
-    uint16_t *line;
+    uint16_t *line[3];
     uint16_t *frame_prev[3];
     double strength[4];
     int hsub, vsub;