Skip to content

Commit 5997665

Browse files
authored
Merge pull request #469 from jellyfin/use-neon-fma-intrin
avfilter/tonemapx: use fma neon intrinsics
2 parents 870ebe2 + 813ec07 commit 5997665

File tree

1 file changed

+24
-24
lines changed

1 file changed

+24
-24
lines changed

debian/patches/0060-add-simd-optimized-tonemapx-filter.patch

+24 -24
Original file line number | Diff line number | Diff line change
@@ -430,28 +430,28 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
430 430
+ float32x4_t luma4 = vdupq_n_f32(0);
431 431
+ float32x4_t overbright4;
432 432
+ // Group A
433-
+ luma4 = vmlaq_n_f32(luma4, r_linx4a, (float)av_q2d(coeffs->cr));
434-
+ luma4 = vmlaq_n_f32(luma4, g_linx4a, (float)av_q2d(coeffs->cg));
435-
+ luma4 = vmlaq_n_f32(luma4, b_linx4a, (float)av_q2d(coeffs->cb));
433+
+ luma4 = vfmaq_n_f32(luma4, r_linx4a, (float)av_q2d(coeffs->cr));
434+
+ luma4 = vfmaq_n_f32(luma4, g_linx4a, (float)av_q2d(coeffs->cg));
435+
+ luma4 = vfmaq_n_f32(luma4, b_linx4a, (float)av_q2d(coeffs->cb));
436 436
+ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4));
437-
+ r_linx4a = vmlsq_f32(r_linx4a, r_linx4a, overbright4);
438-
+ r_linx4a = vmlaq_f32(r_linx4a, luma4, overbright4);
439-
+ g_linx4a = vmlsq_f32(g_linx4a, g_linx4a, overbright4);
440-
+ g_linx4a = vmlaq_f32(g_linx4a, luma4, overbright4);
441-
+ b_linx4a = vmlsq_f32(b_linx4a, b_linx4a, overbright4);
442-
+ b_linx4a = vmlaq_f32(b_linx4a, luma4, overbright4);
437+
+ r_linx4a = vfmsq_f32(r_linx4a, r_linx4a, overbright4);
438+
+ r_linx4a = vfmaq_f32(r_linx4a, luma4, overbright4);
439+
+ g_linx4a = vfmsq_f32(g_linx4a, g_linx4a, overbright4);
440+
+ g_linx4a = vfmaq_f32(g_linx4a, luma4, overbright4);
441+
+ b_linx4a = vfmsq_f32(b_linx4a, b_linx4a, overbright4);
442+
+ b_linx4a = vfmaq_f32(b_linx4a, luma4, overbright4);
443 443
+ // Group B
444 444
+ luma4 = vdupq_n_f32(0);
445-
+ luma4 = vmlaq_n_f32(luma4, r_linx4b, (float)av_q2d(coeffs->cr));
446-
+ luma4 = vmlaq_n_f32(luma4, g_linx4b, (float)av_q2d(coeffs->cg));
447-
+ luma4 = vmlaq_n_f32(luma4, b_linx4b, (float)av_q2d(coeffs->cb));
445+
+ luma4 = vfmaq_n_f32(luma4, r_linx4b, (float)av_q2d(coeffs->cr));
446+
+ luma4 = vfmaq_n_f32(luma4, g_linx4b, (float)av_q2d(coeffs->cg));
447+
+ luma4 = vfmaq_n_f32(luma4, b_linx4b, (float)av_q2d(coeffs->cb));
448 448
+ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4));
449-
+ r_linx4b = vmlsq_f32(r_linx4b, r_linx4b, overbright4);
450-
+ r_linx4b = vmlaq_f32(r_linx4b, luma4, overbright4);
451-
+ g_linx4b = vmlsq_f32(g_linx4b, g_linx4b, overbright4);
452-
+ g_linx4b = vmlaq_f32(g_linx4b, luma4, overbright4);
453-
+ b_linx4b = vmlsq_f32(b_linx4b, b_linx4b, overbright4);
454-
+ b_linx4b = vmlaq_f32(b_linx4b, luma4, overbright4);
449+
+ r_linx4b = vfmsq_f32(r_linx4b, r_linx4b, overbright4);
450+
+ r_linx4b = vfmaq_f32(r_linx4b, luma4, overbright4);
451+
+ g_linx4b = vfmsq_f32(g_linx4b, g_linx4b, overbright4);
452+
+ g_linx4b = vfmaq_f32(g_linx4b, luma4, overbright4);
453+
+ b_linx4b = vfmsq_f32(b_linx4b, b_linx4b, overbright4);
454+
+ b_linx4b = vfmaq_f32(b_linx4b, luma4, overbright4);
455 455
+ }
456 456
+
457 457
+ r_linx4a = vmulq_f32(r_linx4a, mapvalx4a);
@@ -462,12 +462,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
462 462
+ g_linx4b = vmulq_f32(g_linx4b, mapvalx4b);
463 463
+ b_linx4b = vmulq_f32(b_linx4b, mapvalx4b);
464 464
+
465-
+ r_linx4a = vmlaq_n_f32(offset, r_linx4a, 32767);
466-
+ r_linx4b = vmlaq_n_f32(offset, r_linx4b, 32767);
467-
+ g_linx4a = vmlaq_n_f32(offset, g_linx4a, 32767);
468-
+ g_linx4b = vmlaq_n_f32(offset, g_linx4b, 32767);
469-
+ b_linx4a = vmlaq_n_f32(offset, b_linx4a, 32767);
470-
+ b_linx4b = vmlaq_n_f32(offset, b_linx4b, 32767);
465+
+ r_linx4a = vfmaq_n_f32(offset, r_linx4a, 32767);
466+
+ r_linx4b = vfmaq_n_f32(offset, r_linx4b, 32767);
467+
+ g_linx4a = vfmaq_n_f32(offset, g_linx4a, 32767);
468+
+ g_linx4b = vfmaq_n_f32(offset, g_linx4b, 32767);
469+
+ b_linx4a = vfmaq_n_f32(offset, b_linx4a, 32767);
470+
+ b_linx4b = vfmaq_n_f32(offset, b_linx4b, 32767);
471 471
+
472 472
+ rx4a = vcvtq_s32_f32(r_linx4a);
473 473
+ rx4a = vminq_s32(rx4a, output_upper_bound);

0 commit comments

Comments
 (0)