@@ -430,28 +430,28 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
430
430
+ float32x4_t luma4 = vdupq_n_f32(0);
431
431
+ float32x4_t overbright4;
432
432
+ // Group A
433
- + luma4 = vmlaq_n_f32 (luma4, r_linx4a, (float)av_q2d(coeffs->cr));
434
- + luma4 = vmlaq_n_f32 (luma4, g_linx4a, (float)av_q2d(coeffs->cg));
435
- + luma4 = vmlaq_n_f32 (luma4, b_linx4a, (float)av_q2d(coeffs->cb));
433
+ + luma4 = vfmaq_n_f32 (luma4, r_linx4a, (float)av_q2d(coeffs->cr));
434
+ + luma4 = vfmaq_n_f32 (luma4, g_linx4a, (float)av_q2d(coeffs->cg));
435
+ + luma4 = vfmaq_n_f32 (luma4, b_linx4a, (float)av_q2d(coeffs->cb));
436
436
+ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4));
437
- + r_linx4a = vmlsq_f32 (r_linx4a, r_linx4a, overbright4);
438
- + r_linx4a = vmlaq_f32 (r_linx4a, luma4, overbright4);
439
- + g_linx4a = vmlsq_f32 (g_linx4a, g_linx4a, overbright4);
440
- + g_linx4a = vmlaq_f32 (g_linx4a, luma4, overbright4);
441
- + b_linx4a = vmlsq_f32 (b_linx4a, b_linx4a, overbright4);
442
- + b_linx4a = vmlaq_f32 (b_linx4a, luma4, overbright4);
437
+ + r_linx4a = vfmsq_f32 (r_linx4a, r_linx4a, overbright4);
438
+ + r_linx4a = vfmaq_f32 (r_linx4a, luma4, overbright4);
439
+ + g_linx4a = vfmsq_f32 (g_linx4a, g_linx4a, overbright4);
440
+ + g_linx4a = vfmaq_f32 (g_linx4a, luma4, overbright4);
441
+ + b_linx4a = vfmsq_f32 (b_linx4a, b_linx4a, overbright4);
442
+ + b_linx4a = vfmaq_f32 (b_linx4a, luma4, overbright4);
443
443
+ // Group B
444
444
+ luma4 = vdupq_n_f32(0);
445
- + luma4 = vmlaq_n_f32 (luma4, r_linx4b, (float)av_q2d(coeffs->cr));
446
- + luma4 = vmlaq_n_f32 (luma4, g_linx4b, (float)av_q2d(coeffs->cg));
447
- + luma4 = vmlaq_n_f32 (luma4, b_linx4b, (float)av_q2d(coeffs->cb));
445
+ + luma4 = vfmaq_n_f32 (luma4, r_linx4b, (float)av_q2d(coeffs->cr));
446
+ + luma4 = vfmaq_n_f32 (luma4, g_linx4b, (float)av_q2d(coeffs->cg));
447
+ + luma4 = vfmaq_n_f32 (luma4, b_linx4b, (float)av_q2d(coeffs->cb));
448
448
+ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4));
449
- + r_linx4b = vmlsq_f32 (r_linx4b, r_linx4b, overbright4);
450
- + r_linx4b = vmlaq_f32 (r_linx4b, luma4, overbright4);
451
- + g_linx4b = vmlsq_f32 (g_linx4b, g_linx4b, overbright4);
452
- + g_linx4b = vmlaq_f32 (g_linx4b, luma4, overbright4);
453
- + b_linx4b = vmlsq_f32 (b_linx4b, b_linx4b, overbright4);
454
- + b_linx4b = vmlaq_f32 (b_linx4b, luma4, overbright4);
449
+ + r_linx4b = vfmsq_f32 (r_linx4b, r_linx4b, overbright4);
450
+ + r_linx4b = vfmaq_f32 (r_linx4b, luma4, overbright4);
451
+ + g_linx4b = vfmsq_f32 (g_linx4b, g_linx4b, overbright4);
452
+ + g_linx4b = vfmaq_f32 (g_linx4b, luma4, overbright4);
453
+ + b_linx4b = vfmsq_f32 (b_linx4b, b_linx4b, overbright4);
454
+ + b_linx4b = vfmaq_f32 (b_linx4b, luma4, overbright4);
455
455
+ }
456
456
+
457
457
+ r_linx4a = vmulq_f32(r_linx4a, mapvalx4a);
@@ -462,12 +462,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
462
462
+ g_linx4b = vmulq_f32(g_linx4b, mapvalx4b);
463
463
+ b_linx4b = vmulq_f32(b_linx4b, mapvalx4b);
464
464
+
465
- + r_linx4a = vmlaq_n_f32 (offset, r_linx4a, 32767);
466
- + r_linx4b = vmlaq_n_f32 (offset, r_linx4b, 32767);
467
- + g_linx4a = vmlaq_n_f32 (offset, g_linx4a, 32767);
468
- + g_linx4b = vmlaq_n_f32 (offset, g_linx4b, 32767);
469
- + b_linx4a = vmlaq_n_f32 (offset, b_linx4a, 32767);
470
- + b_linx4b = vmlaq_n_f32 (offset, b_linx4b, 32767);
465
+ + r_linx4a = vfmaq_n_f32 (offset, r_linx4a, 32767);
466
+ + r_linx4b = vfmaq_n_f32 (offset, r_linx4b, 32767);
467
+ + g_linx4a = vfmaq_n_f32 (offset, g_linx4a, 32767);
468
+ + g_linx4b = vfmaq_n_f32 (offset, g_linx4b, 32767);
469
+ + b_linx4a = vfmaq_n_f32 (offset, b_linx4a, 32767);
470
+ + b_linx4b = vfmaq_n_f32 (offset, b_linx4b, 32767);
471
471
+
472
472
+ rx4a = vcvtq_s32_f32(r_linx4a);
473
473
+ rx4a = vminq_s32(rx4a, output_upper_bound);
0 commit comments