FFmpeg  4.4.5
swscale_vsx.c
1 /*
2  * AltiVec-enhanced yuv2yuvX
3  *
4  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  * based on the equivalent C code in swscale.c
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include <inttypes.h>
25 
26 #include "config.h"
27 #include "libswscale/swscale.h"
29 #include "libavutil/attributes.h"
30 #include "libavutil/cpu.h"
31 #include "libavutil/mem_internal.h"
32 #include "yuv2rgb_altivec.h"
34 
35 #if HAVE_VSX
36 #define vzero vec_splat_s32(0)
37 
38 #if !HAVE_BIGENDIAN
39 #define GET_LS(a,b,c,s) {\
40  ls = a;\
41  a = vec_vsx_ld(((b) << 1) + 16, s);\
42  }
43 
44 #define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
45  vector signed short ls;\
46  vector signed int vf1, vf2, i1, i2;\
47  GET_LS(l1, x, perm, src);\
48  i1 = vec_mule(filter, ls);\
49  i2 = vec_mulo(filter, ls);\
50  vf1 = vec_mergeh(i1, i2);\
51  vf2 = vec_mergel(i1, i2);\
52  d1 = vec_add(d1, vf1);\
53  d2 = vec_add(d2, vf2);\
54  } while (0)
55 
56 #define LOAD_FILTER(vf,f) {\
57  vf = vec_vsx_ld(joffset, f);\
58 }
59 #define LOAD_L1(ll1,s,p){\
60  ll1 = vec_vsx_ld(xoffset, s);\
61 }
62 
63 // The 3 below (in GET_VF4's "a << 3") is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
64 
65 // The neat trick: We only care for half the elements,
66 // high or low depending on (i<<3)%16 (it's 0 or 8 here),
67 // and we're going to use vec_mule, so we choose
68 // carefully how to "unpack" the elements into the even slots.
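// Roughly: for filterSize == 4, vec_vsx_ld(a << 3, f) loads 8 shorts
// starting at coefficient a * 4 (the four taps for output pixel a, plus
// the next pixel's), and vec_mergeh with the zero vector spreads them
// into the even lanes:
//     {f0, f1, f2, f3, x, x, x, x}  ->  {f0, 0, f1, 0, f2, 0, f3, 0}
// so the later vec_mule reads the taps without any further shuffling.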
69 #define GET_VF4(a, vf, f) {\
70  vf = (vector signed short)vec_vsx_ld(a << 3, f);\
71  vf = vec_mergeh(vf, (vector signed short)vzero);\
72 }
73 #define FIRST_LOAD(sv, pos, s, per) {}
74 #define UPDATE_PTR(s0, d0, s1, d1) {}
75 #define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
76  vf = vec_vsx_ld(pos + a, s);\
77 }
78 #define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) LOAD_SRCV(pos, a, s, per, v0, v1, vf)
79 #define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
80  vf = vec_vsx_ld((a * 2 * filterSize) + (b * 2) + off, f);\
81 }
82 
83 #define FUNC(name) name ## _vsx
84 #include "swscale_ppc_template.c"
85 #undef FUNC
86 
87 #undef vzero
88 
89 #endif /* !HAVE_BIGENDIAN */
90 
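/* src carries the plane at 7 extra fractional bits; dither[] adds a small
 * per-column offset (the index cycles every 8 pixels) so the >> 7 rounds
 * by ordered dithering rather than plain truncation, then the result is
 * clipped to the 0..255 output range. */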
91 static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
92  const uint8_t *dither, int offset, int start)
93 {
94  int i;
95  for (i = start; i < dstW; i++) {
96  int val = (src[i] + dither[(i + offset) & 7]) >> 7;
97  dest[i] = av_clip_uint8(val);
98  }
99 }
100 
101 static void yuv2plane1_8_vsx(const int16_t *src, uint8_t *dest, int dstW,
102  const uint8_t *dither, int offset)
103 {
104  const int dst_u = -(uintptr_t)dest & 15;
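 /* -(uintptr_t)dest & 15 is the byte distance from dest to the next
  * 16-byte boundary (e.g. an address ending in 0x9 gives 7): the first
  * dst_u pixels and the tail are handled by the scalar helper, the
  * aligned middle is written 16 pixels at a time with vec_st. */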
105  int i, j;
106  LOCAL_ALIGNED(16, int16_t, val, [16]);
107  const vec_u16 shifts = (vec_u16) {7, 7, 7, 7, 7, 7, 7, 7};
108  vec_s16 vi, vileft, ditherleft, ditherright;
109  vec_u8 vd;
110 
111  for (j = 0; j < 16; j++) {
112  val[j] = dither[(dst_u + offset + j) & 7];
113  }
114 
115  ditherleft = vec_ld(0, val);
116  ditherright = vec_ld(0, &val[8]);
117 
118  yuv2plane1_8_u(src, dest, dst_u, dither, offset, 0);
119 
120  for (i = dst_u; i < dstW - 15; i += 16) {
121 
122  vi = vec_vsx_ld(0, &src[i]);
123  vi = vec_adds(ditherleft, vi);
124  vileft = vec_sra(vi, shifts);
125 
126  vi = vec_vsx_ld(0, &src[i + 8]);
127  vi = vec_adds(ditherright, vi);
128  vi = vec_sra(vi, shifts);
129 
130  vd = vec_packsu(vileft, vi);
131  vec_st(vd, 0, &dest[i]);
132  }
133 
134  yuv2plane1_8_u(src, dest, dstW, dither, offset, i);
135 }
136 
137 #if !HAVE_BIGENDIAN
138 
139 #define output_pixel(pos, val) \
140  if (big_endian) { \
141  AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
142  } else { \
143  AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
144  }
145 
146 static void yuv2plane1_nbps_u(const int16_t *src, uint16_t *dest, int dstW,
147  int big_endian, int output_bits, int start)
148 {
149  int i;
150  int shift = 15 - output_bits;
151 
152  for (i = start; i < dstW; i++) {
153  int val = src[i] + (1 << (shift - 1));
154  output_pixel(&dest[i], val);
155  }
156 }
157 
158 static av_always_inline void yuv2plane1_nbps_vsx(const int16_t *src,
159  uint16_t *dest, int dstW,
160  const int big_endian,
161  const int output_bits)
162 {
163  const int dst_u = -(uintptr_t)dest & 7;
164  const int shift = 15 - output_bits;
165  const int add = (1 << (shift - 1));
166  const int clip = (1 << output_bits) - 1;
167  const vec_u16 vadd = (vec_u16) {add, add, add, add, add, add, add, add};
168  const vec_u16 vswap = (vec_u16) vec_splat_u16(big_endian ? 8 : 0);
169  const vec_u16 vshift = (vec_u16) vec_splat_u16(shift);
170  const vec_u16 vlargest = (vec_u16) {clip, clip, clip, clip, clip, clip, clip, clip};
171  vec_u16 v;
172  int i;
173 
174  yuv2plane1_nbps_u(src, dest, dst_u, big_endian, output_bits, 0);
175 
176  for (i = dst_u; i < dstW - 7; i += 8) {
177  v = vec_vsx_ld(0, (const uint16_t *) &src[i]);
178  v = vec_add(v, vadd);
179  v = vec_sr(v, vshift);
180  v = vec_min(v, vlargest);
181  v = vec_rl(v, vswap);
182  vec_st(v, 0, &dest[i]);
183  }
184 
185  yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
186 }
187 
188 static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
189  const int16_t **src, uint16_t *dest, int dstW,
190  int big_endian, int output_bits, int start)
191 {
192  int i;
193  int shift = 11 + 16 - output_bits;
194 
195  for (i = start; i < dstW; i++) {
196  int val = 1 << (shift - 1);
197  int j;
198 
199  for (j = 0; j < filterSize; j++)
200  val += src[j][i] * filter[j];
201 
202  output_pixel(&dest[i], val);
203  }
204 }
205 
206 static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
207  const int16_t **src, uint16_t *dest, int dstW,
208  int big_endian, int output_bits)
209 {
210  const int dst_u = -(uintptr_t)dest & 7;
211  const int shift = 11 + 16 - output_bits;
212  const int add = (1 << (shift - 1));
213  const int clip = (1 << output_bits) - 1;
214  const uint16_t swap = big_endian ? 8 : 0;
215  const vec_u32 vadd = (vec_u32) {add, add, add, add};
216  const vec_u32 vshift = (vec_u32) {shift, shift, shift, shift};
217  const vec_u16 vswap = (vec_u16) {swap, swap, swap, swap, swap, swap, swap, swap};
218  const vec_u16 vlargest = (vec_u16) {clip, clip, clip, clip, clip, clip, clip, clip};
219  const vec_s16 vzero = vec_splat_s16(0);
220  const vec_u8 vperm = (vec_u8) {0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
221  vec_s16 vfilter[MAX_FILTER_SIZE], vin;
222  vec_u16 v;
223  vec_u32 vleft, vright, vtmp;
224  int i, j;
225 
226  for (i = 0; i < filterSize; i++) {
227  vfilter[i] = (vec_s16) {filter[i], filter[i], filter[i], filter[i],
228  filter[i], filter[i], filter[i], filter[i]};
229  }
230 
231  yuv2planeX_nbps_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0);
232 
233  for (i = dst_u; i < dstW - 7; i += 8) {
234  vleft = vright = vadd;
235 
236  for (j = 0; j < filterSize; j++) {
237  vin = vec_vsx_ld(0, &src[j][i]);
238  vtmp = (vec_u32) vec_mule(vin, vfilter[j]);
239  vleft = vec_add(vleft, vtmp);
240  vtmp = (vec_u32) vec_mulo(vin, vfilter[j]);
241  vright = vec_add(vright, vtmp);
242  }
243 
244  vleft = vec_sra(vleft, vshift);
245  vright = vec_sra(vright, vshift);
246  v = vec_packsu(vleft, vright);
247  v = (vec_u16) vec_max((vec_s16) v, vzero);
248  v = vec_min(v, vlargest);
249  v = vec_rl(v, vswap);
250  v = vec_perm(v, v, vperm);
251  vec_st(v, 0, &dest[i]);
252  }
253 
254  yuv2planeX_nbps_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i);
255 }
256 
257 
258 #undef output_pixel
259 
260 #define output_pixel(pos, val, bias, signedness) \
261  if (big_endian) { \
262  AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
263  } else { \
264  AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
265  }
266 
267 static void yuv2plane1_16_u(const int32_t *src, uint16_t *dest, int dstW,
268  int big_endian, int output_bits, int start)
269 {
270  int i;
271  const int shift = 3;
272 
273  for (i = start; i < dstW; i++) {
274  int val = src[i] + (1 << (shift - 1));
275  output_pixel(&dest[i], val, 0, uint);
276  }
277 }
278 
279 static av_always_inline void yuv2plane1_16_vsx(const int32_t *src,
280  uint16_t *dest, int dstW,
281  const int big_endian,
282  int output_bits)
283 {
284  const int dst_u = -(uintptr_t)dest & 7;
285  const int shift = 3;
286  const int add = (1 << (shift - 1));
287  const vec_u32 vadd = (vec_u32) {add, add, add, add};
288  const vec_u16 vswap = (vec_u16) vec_splat_u16(big_endian ? 8 : 0);
289  const vec_u32 vshift = (vec_u32) vec_splat_u32(shift);
290  vec_u32 v, v2;
291  vec_u16 vd;
292  int i;
293 
294  yuv2plane1_16_u(src, dest, dst_u, big_endian, output_bits, 0);
295 
296  for (i = dst_u; i < dstW - 7; i += 8) {
297  v = vec_vsx_ld(0, (const uint32_t *) &src[i]);
298  v = vec_add(v, vadd);
299  v = vec_sr(v, vshift);
300 
301  v2 = vec_vsx_ld(0, (const uint32_t *) &src[i + 4]);
302  v2 = vec_add(v2, vadd);
303  v2 = vec_sr(v2, vshift);
304 
305  vd = vec_packsu(v, v2);
306  vd = vec_rl(vd, vswap);
307 
308  vec_st(vd, 0, &dest[i]);
309  }
310 
311  yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
312 }
313 
314 #if HAVE_POWER8
315 
316 static void yuv2planeX_16_u(const int16_t *filter, int filterSize,
317  const int32_t **src, uint16_t *dest, int dstW,
318  int big_endian, int output_bits, int start)
319 {
320  int i;
321  int shift = 15;
322 
323  for (i = start; i < dstW; i++) {
324  int val = 1 << (shift - 1);
325  int j;
326 
327  /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
328  * filters (or anything with negative coeffs), the range can be slightly
329  * wider in both directions. To account for this overflow, we subtract
330  * a constant so it always fits in the signed range (assuming a
331  * reasonable filterSize), and re-add that at the end. */
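 /* Concretely: with shift == 15 the accumulator starts at 1 << 14,
  * and 0x40000000 >> 15 == 0x8000, which is exactly the bias that
  * output_pixel(..., 0x8000, int) adds back after the shift, so the
  * recentering cancels out in the stored result. */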
332  val -= 0x40000000;
333  for (j = 0; j < filterSize; j++)
334  val += src[j][i] * (unsigned)filter[j];
335 
336  output_pixel(&dest[i], val, 0x8000, int);
337  }
338 }
339 
340 static void yuv2planeX_16_vsx(const int16_t *filter, int filterSize,
341  const int32_t **src, uint16_t *dest, int dstW,
342  int big_endian, int output_bits)
343 {
344  const int dst_u = -(uintptr_t)dest & 7;
345  const int shift = 15;
346  const int bias = 0x8000;
347  const int add = (1 << (shift - 1)) - 0x40000000;
348  const uint16_t swap = big_endian ? 8 : 0;
349  const vec_u32 vadd = (vec_u32) {add, add, add, add};
350  const vec_u32 vshift = (vec_u32) {shift, shift, shift, shift};
351  const vec_u16 vswap = (vec_u16) {swap, swap, swap, swap, swap, swap, swap, swap};
352  const vec_u16 vbias = (vec_u16) {bias, bias, bias, bias, bias, bias, bias, bias};
353  vec_s32 vfilter[MAX_FILTER_SIZE];
354  vec_u16 v;
355  vec_u32 vleft, vright, vtmp;
356  vec_s32 vin32l, vin32r;
357  int i, j;
358 
359  for (i = 0; i < filterSize; i++) {
360  vfilter[i] = (vec_s32) {filter[i], filter[i], filter[i], filter[i]};
361  }
362 
363  yuv2planeX_16_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0);
364 
365  for (i = dst_u; i < dstW - 7; i += 8) {
366  vleft = vright = vadd;
367 
368  for (j = 0; j < filterSize; j++) {
369  vin32l = vec_vsx_ld(0, &src[j][i]);
370  vin32r = vec_vsx_ld(0, &src[j][i + 4]);
371 
372  vtmp = (vec_u32) vec_mul(vin32l, vfilter[j]);
373  vleft = vec_add(vleft, vtmp);
374  vtmp = (vec_u32) vec_mul(vin32r, vfilter[j]);
375  vright = vec_add(vright, vtmp);
376  }
377 
378  vleft = vec_sra(vleft, vshift);
379  vright = vec_sra(vright, vshift);
380  v = (vec_u16) vec_packs((vec_s32) vleft, (vec_s32) vright);
381  v = vec_add(v, vbias);
382  v = vec_rl(v, vswap);
383  vec_st(v, 0, &dest[i]);
384  }
385 
386  yuv2planeX_16_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i);
387 }
388 
389 #endif /* HAVE_POWER8 */
390 
391 #define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
392  yuv2NBPS1(bits, BE_LE, is_be, template_size, typeX_t) \
393  yuv2NBPSX(bits, BE_LE, is_be, template_size, typeX_t)
394 
395 #define yuv2NBPS1(bits, BE_LE, is_be, template_size, typeX_t) \
396 static void yuv2plane1_ ## bits ## BE_LE ## _vsx(const int16_t *src, \
397  uint8_t *dest, int dstW, \
398  const uint8_t *dither, int offset) \
399 { \
400  yuv2plane1_ ## template_size ## _vsx((const typeX_t *) src, \
401  (uint16_t *) dest, dstW, is_be, bits); \
402 }
403 
404 #define yuv2NBPSX(bits, BE_LE, is_be, template_size, typeX_t) \
405 static void yuv2planeX_ ## bits ## BE_LE ## _vsx(const int16_t *filter, int filterSize, \
406  const int16_t **src, uint8_t *dest, int dstW, \
407  const uint8_t *dither, int offset)\
408 { \
409  yuv2planeX_## template_size ## _vsx(filter, \
410  filterSize, (const typeX_t **) src, \
411  (uint16_t *) dest, dstW, is_be, bits); \
412 }
413 
414 yuv2NBPS( 9, BE, 1, nbps, int16_t)
415 yuv2NBPS( 9, LE, 0, nbps, int16_t)
416 yuv2NBPS(10, BE, 1, nbps, int16_t)
417 yuv2NBPS(10, LE, 0, nbps, int16_t)
418 yuv2NBPS(12, BE, 1, nbps, int16_t)
419 yuv2NBPS(12, LE, 0, nbps, int16_t)
420 yuv2NBPS(14, BE, 1, nbps, int16_t)
421 yuv2NBPS(14, LE, 0, nbps, int16_t)
422 
423 yuv2NBPS1(16, BE, 1, 16, int32_t)
424 yuv2NBPS1(16, LE, 0, 16, int32_t)
425 #if HAVE_POWER8
426 yuv2NBPSX(16, BE, 1, 16, int32_t)
427 yuv2NBPSX(16, LE, 0, 16, int32_t)
428 #endif
429 
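/*
 * WRITERGB expects R/G/B as 32-bit fixed-point values with 22 fractional
 * bits: it clamps them to [0, 1 << 30], drops the fraction with >> 22,
 * narrows 32 -> 16 -> 8 bits through saturating packs (so anything that
 * still reaches 256 saturates to 255), then permutes/merges the bytes
 * into the byte order of the target format, advancing dest by 24 bytes
 * for packed 24-bit output or 32 bytes for the 4-byte formats.
 */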
430 #define WRITERGB \
431  R_l = vec_max(R_l, zero32); \
432  R_r = vec_max(R_r, zero32); \
433  G_l = vec_max(G_l, zero32); \
434  G_r = vec_max(G_r, zero32); \
435  B_l = vec_max(B_l, zero32); \
436  B_r = vec_max(B_r, zero32); \
437 \
438  R_l = vec_min(R_l, rgbclip); \
439  R_r = vec_min(R_r, rgbclip); \
440  G_l = vec_min(G_l, rgbclip); \
441  G_r = vec_min(G_r, rgbclip); \
442  B_l = vec_min(B_l, rgbclip); \
443  B_r = vec_min(B_r, rgbclip); \
444 \
445  R_l = vec_sr(R_l, shift22); \
446  R_r = vec_sr(R_r, shift22); \
447  G_l = vec_sr(G_l, shift22); \
448  G_r = vec_sr(G_r, shift22); \
449  B_l = vec_sr(B_l, shift22); \
450  B_r = vec_sr(B_r, shift22); \
451 \
452  rd16 = vec_packsu(R_l, R_r); \
453  gd16 = vec_packsu(G_l, G_r); \
454  bd16 = vec_packsu(B_l, B_r); \
455  rd = vec_packsu(rd16, zero16); \
456  gd = vec_packsu(gd16, zero16); \
457  bd = vec_packsu(bd16, zero16); \
458 \
459  switch(target) { \
460  case AV_PIX_FMT_RGB24: \
461  out0 = vec_perm(rd, gd, perm3rg0); \
462  out0 = vec_perm(out0, bd, perm3tb0); \
463  out1 = vec_perm(rd, gd, perm3rg1); \
464  out1 = vec_perm(out1, bd, perm3tb1); \
465 \
466  vec_vsx_st(out0, 0, dest); \
467  vec_vsx_st(out1, 16, dest); \
468 \
469  dest += 24; \
470  break; \
471  case AV_PIX_FMT_BGR24: \
472  out0 = vec_perm(bd, gd, perm3rg0); \
473  out0 = vec_perm(out0, rd, perm3tb0); \
474  out1 = vec_perm(bd, gd, perm3rg1); \
475  out1 = vec_perm(out1, rd, perm3tb1); \
476 \
477  vec_vsx_st(out0, 0, dest); \
478  vec_vsx_st(out1, 16, dest); \
479 \
480  dest += 24; \
481  break; \
482  case AV_PIX_FMT_BGRA: \
483  out0 = vec_mergeh(bd, gd); \
484  out1 = vec_mergeh(rd, ad); \
485 \
486  tmp8 = (vec_u8) vec_mergeh((vec_u16) out0, (vec_u16) out1); \
487  vec_vsx_st(tmp8, 0, dest); \
488  tmp8 = (vec_u8) vec_mergel((vec_u16) out0, (vec_u16) out1); \
489  vec_vsx_st(tmp8, 16, dest); \
490 \
491  dest += 32; \
492  break; \
493  case AV_PIX_FMT_RGBA: \
494  out0 = vec_mergeh(rd, gd); \
495  out1 = vec_mergeh(bd, ad); \
496 \
497  tmp8 = (vec_u8) vec_mergeh((vec_u16) out0, (vec_u16) out1); \
498  vec_vsx_st(tmp8, 0, dest); \
499  tmp8 = (vec_u8) vec_mergel((vec_u16) out0, (vec_u16) out1); \
500  vec_vsx_st(tmp8, 16, dest); \
501 \
502  dest += 32; \
503  break; \
504  case AV_PIX_FMT_ARGB: \
505  out0 = vec_mergeh(ad, rd); \
506  out1 = vec_mergeh(gd, bd); \
507 \
508  tmp8 = (vec_u8) vec_mergeh((vec_u16) out0, (vec_u16) out1); \
509  vec_vsx_st(tmp8, 0, dest); \
510  tmp8 = (vec_u8) vec_mergel((vec_u16) out0, (vec_u16) out1); \
511  vec_vsx_st(tmp8, 16, dest); \
512 \
513  dest += 32; \
514  break; \
515  case AV_PIX_FMT_ABGR: \
516  out0 = vec_mergeh(ad, bd); \
517  out1 = vec_mergeh(gd, rd); \
518 \
519  tmp8 = (vec_u8) vec_mergeh((vec_u16) out0, (vec_u16) out1); \
520  vec_vsx_st(tmp8, 0, dest); \
521  tmp8 = (vec_u8) vec_mergel((vec_u16) out0, (vec_u16) out1); \
522  vec_vsx_st(tmp8, 16, dest); \
523 \
524  dest += 32; \
525  break; \
526  }
527 
528 static av_always_inline void
529 yuv2rgb_full_X_vsx_template(SwsContext *c, const int16_t *lumFilter,
530  const int16_t **lumSrc, int lumFilterSize,
531  const int16_t *chrFilter, const int16_t **chrUSrc,
532  const int16_t **chrVSrc, int chrFilterSize,
533  const int16_t **alpSrc, uint8_t *dest,
534  int dstW, int y, enum AVPixelFormat target, int hasAlpha)
535 {
536  vec_s16 vv;
537  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
538  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r;
539  vec_s32 tmp, tmp2, tmp3, tmp4;
540  vec_u16 rd16, gd16, bd16;
541  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
542  vec_s16 vlumFilter[MAX_FILTER_SIZE], vchrFilter[MAX_FILTER_SIZE];
543  const vec_s32 ystart = vec_splats(1 << 9);
544  const vec_s32 uvstart = vec_splats((1 << 9) - (128 << 19));
545  const vec_u16 zero16 = vec_splat_u16(0);
546  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
547  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
548  const vec_s32 y_add = vec_splats(1 << 21);
549  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
550  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
551  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
552  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
553  const vec_s32 rgbclip = vec_splats(1 << 30);
554  const vec_s32 zero32 = vec_splat_s32(0);
555  const vec_u32 shift22 = vec_splats(22U);
556  const vec_u32 shift10 = vec_splat_u32(10);
557  int i, j;
558 
559  // Various permutations
560  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
561  0x1, 0x11, 0,
562  0x2, 0x12, 0,
563  0x3, 0x13, 0,
564  0x4, 0x14, 0,
565  0x5 };
566  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
567  0x6, 0x16, 0,
568  0x7, 0x17, 0 };
569  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
570  0x3, 0x4, 0x11,
571  0x6, 0x7, 0x12,
572  0x9, 0xa, 0x13,
573  0xc, 0xd, 0x14,
574  0xf };
575  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
576  0x2, 0x3, 0x16,
577  0x5, 0x6, 0x17 };
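 /* perm3rg0/perm3rg1 interleave R and G bytes with placeholder slots
  * (the 0 entries); perm3tb0/perm3tb1 then overwrite those slots with
  * B bytes.  First pixel of out0: rd[0x0] = R0, gd (element 0x10) = G0,
  * placeholder, after which perm3tb0 keeps bytes 0x0/0x1 and inserts
  * bd (element 0x10) = B0, yielding the packed triple R0 G0 B0. */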
578 
579  ad = vec_splats((uint8_t) 255);
580 
581  for (i = 0; i < lumFilterSize; i++)
582  vlumFilter[i] = vec_splats(lumFilter[i]);
583  for (i = 0; i < chrFilterSize; i++)
584  vchrFilter[i] = vec_splats(chrFilter[i]);
585 
586  for (i = 0; i < dstW; i += 8) {
587  vy32_l =
588  vy32_r = ystart;
589  vu32_l =
590  vu32_r =
591  vv32_l =
592  vv32_r = uvstart;
593 
594  for (j = 0; j < lumFilterSize; j++) {
595  vv = vec_ld(0, &lumSrc[j][i]);
596  tmp = vec_mule(vv, vlumFilter[j]);
597  tmp2 = vec_mulo(vv, vlumFilter[j]);
598  tmp3 = vec_mergeh(tmp, tmp2);
599  tmp4 = vec_mergel(tmp, tmp2);
600 
601  vy32_l = vec_adds(vy32_l, tmp3);
602  vy32_r = vec_adds(vy32_r, tmp4);
603  }
604 
605  for (j = 0; j < chrFilterSize; j++) {
606  vv = vec_ld(0, &chrUSrc[j][i]);
607  tmp = vec_mule(vv, vchrFilter[j]);
608  tmp2 = vec_mulo(vv, vchrFilter[j]);
609  tmp3 = vec_mergeh(tmp, tmp2);
610  tmp4 = vec_mergel(tmp, tmp2);
611 
612  vu32_l = vec_adds(vu32_l, tmp3);
613  vu32_r = vec_adds(vu32_r, tmp4);
614 
615  vv = vec_ld(0, &chrVSrc[j][i]);
616  tmp = vec_mule(vv, vchrFilter[j]);
617  tmp2 = vec_mulo(vv, vchrFilter[j]);
618  tmp3 = vec_mergeh(tmp, tmp2);
619  tmp4 = vec_mergel(tmp, tmp2);
620 
621  vv32_l = vec_adds(vv32_l, tmp3);
622  vv32_r = vec_adds(vv32_r, tmp4);
623  }
624 
625  vy32_l = vec_sra(vy32_l, shift10);
626  vy32_r = vec_sra(vy32_r, shift10);
627  vu32_l = vec_sra(vu32_l, shift10);
628  vu32_r = vec_sra(vu32_r, shift10);
629  vv32_l = vec_sra(vv32_l, shift10);
630  vv32_r = vec_sra(vv32_r, shift10);
631 
632  vy32_l = vec_sub(vy32_l, y_offset);
633  vy32_r = vec_sub(vy32_r, y_offset);
634  vy32_l = vec_mul(vy32_l, y_coeff);
635  vy32_r = vec_mul(vy32_r, y_coeff);
636  vy32_l = vec_add(vy32_l, y_add);
637  vy32_r = vec_add(vy32_r, y_add);
638 
639  R_l = vec_mul(vv32_l, v2r_coeff);
640  R_l = vec_add(R_l, vy32_l);
641  R_r = vec_mul(vv32_r, v2r_coeff);
642  R_r = vec_add(R_r, vy32_r);
643  G_l = vec_mul(vv32_l, v2g_coeff);
644  tmp32 = vec_mul(vu32_l, u2g_coeff);
645  G_l = vec_add(G_l, vy32_l);
646  G_l = vec_add(G_l, tmp32);
647  G_r = vec_mul(vv32_r, v2g_coeff);
648  tmp32 = vec_mul(vu32_r, u2g_coeff);
649  G_r = vec_add(G_r, vy32_r);
650  G_r = vec_add(G_r, tmp32);
651 
652  B_l = vec_mul(vu32_l, u2b_coeff);
653  B_l = vec_add(B_l, vy32_l);
654  B_r = vec_mul(vu32_r, u2b_coeff);
655  B_r = vec_add(B_r, vy32_r);
656 
657  WRITERGB
658  }
659 }
660 
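/*
 * SETUP blends one vector of 8 int16 samples from two successive lines:
 * tmp3/tmp4 end up holding buf0[k] * alpha1 + buf1[k] * alpha as 32-bit
 * sums (elements 0-3 in tmp3, 4-7 in tmp4).  vec_mule/vec_mulo give the
 * even/odd products and vec_mergeh/vec_mergel restore element order;
 * with alpha1 == 4096 - alpha this is a 12-bit linear interpolation
 * between the two lines.
 */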
661 #define SETUP(x, buf0, alpha1, buf1, alpha) { \
662  x = vec_ld(0, buf0); \
663  tmp = vec_mule(x, alpha1); \
664  tmp2 = vec_mulo(x, alpha1); \
665  tmp3 = vec_mergeh(tmp, tmp2); \
666  tmp4 = vec_mergel(tmp, tmp2); \
667 \
668  x = vec_ld(0, buf1); \
669  tmp = vec_mule(x, alpha); \
670  tmp2 = vec_mulo(x, alpha); \
671  tmp5 = vec_mergeh(tmp, tmp2); \
672  tmp6 = vec_mergel(tmp, tmp2); \
673 \
674  tmp3 = vec_add(tmp3, tmp5); \
675  tmp4 = vec_add(tmp4, tmp6); \
676 }
677 
678 
679 static av_always_inline void
680 yuv2rgb_full_2_vsx_template(SwsContext *c, const int16_t *buf[2],
681  const int16_t *ubuf[2], const int16_t *vbuf[2],
682  const int16_t *abuf[2], uint8_t *dest, int dstW,
683  int yalpha, int uvalpha, int y,
684  enum AVPixelFormat target, int hasAlpha)
685 {
686  const int16_t *buf0 = buf[0], *buf1 = buf[1],
687  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
688  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
689  *abuf0 = hasAlpha ? abuf[0] : NULL,
690  *abuf1 = hasAlpha ? abuf[1] : NULL;
691  const int16_t yalpha1 = 4096 - yalpha;
692  const int16_t uvalpha1 = 4096 - uvalpha;
693  vec_s16 vy, vu, vv, A = vec_splat_s16(0);
694  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
695  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r;
696  vec_s32 tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
697  vec_u16 rd16, gd16, bd16;
698  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
699  const vec_s16 vyalpha1 = vec_splats(yalpha1);
700  const vec_s16 vuvalpha1 = vec_splats(uvalpha1);
701  const vec_s16 vyalpha = vec_splats((int16_t) yalpha);
702  const vec_s16 vuvalpha = vec_splats((int16_t) uvalpha);
703  const vec_u16 zero16 = vec_splat_u16(0);
704  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
705  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
706  const vec_s32 y_add = vec_splats(1 << 21);
707  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
708  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
709  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
710  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
711  const vec_s32 rgbclip = vec_splats(1 << 30);
712  const vec_s32 zero32 = vec_splat_s32(0);
713  const vec_u32 shift19 = vec_splats(19U);
714  const vec_u32 shift22 = vec_splats(22U);
715  const vec_u32 shift10 = vec_splat_u32(10);
716  const vec_s32 dec128 = vec_splats(128 << 19);
717  const vec_s32 add18 = vec_splats(1 << 18);
718  int i;
719 
720  // Various permutations
721  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
722  0x1, 0x11, 0,
723  0x2, 0x12, 0,
724  0x3, 0x13, 0,
725  0x4, 0x14, 0,
726  0x5 };
727  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
728  0x6, 0x16, 0,
729  0x7, 0x17, 0 };
730  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
731  0x3, 0x4, 0x11,
732  0x6, 0x7, 0x12,
733  0x9, 0xa, 0x13,
734  0xc, 0xd, 0x14,
735  0xf };
736  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
737  0x2, 0x3, 0x16,
738  0x5, 0x6, 0x17 };
739 
740  av_assert2(yalpha <= 4096U);
741  av_assert2(uvalpha <= 4096U);
742 
743  for (i = 0; i < dstW; i += 8) {
744  SETUP(vy, &buf0[i], vyalpha1, &buf1[i], vyalpha);
745  vy32_l = vec_sra(tmp3, shift10);
746  vy32_r = vec_sra(tmp4, shift10);
747 
748  SETUP(vu, &ubuf0[i], vuvalpha1, &ubuf1[i], vuvalpha);
749  tmp3 = vec_sub(tmp3, dec128);
750  tmp4 = vec_sub(tmp4, dec128);
751  vu32_l = vec_sra(tmp3, shift10);
752  vu32_r = vec_sra(tmp4, shift10);
753 
754  SETUP(vv, &vbuf0[i], vuvalpha1, &vbuf1[i], vuvalpha);
755  tmp3 = vec_sub(tmp3, dec128);
756  tmp4 = vec_sub(tmp4, dec128);
757  vv32_l = vec_sra(tmp3, shift10);
758  vv32_r = vec_sra(tmp4, shift10);
759 
760  if (hasAlpha) {
761  SETUP(A, &abuf0[i], vyalpha1, &abuf1[i], vyalpha);
762  tmp3 = vec_add(tmp3, add18);
763  tmp4 = vec_add(tmp4, add18);
764  tmp3 = vec_sra(tmp3, shift19);
765  tmp4 = vec_sra(tmp4, shift19);
766  A = vec_packs(tmp3, tmp4);
767  ad = vec_packsu(A, (vec_s16) zero16);
768  } else {
769  ad = vec_splats((uint8_t) 255);
770  }
771 
772  vy32_l = vec_sub(vy32_l, y_offset);
773  vy32_r = vec_sub(vy32_r, y_offset);
774  vy32_l = vec_mul(vy32_l, y_coeff);
775  vy32_r = vec_mul(vy32_r, y_coeff);
776  vy32_l = vec_add(vy32_l, y_add);
777  vy32_r = vec_add(vy32_r, y_add);
778 
779  R_l = vec_mul(vv32_l, v2r_coeff);
780  R_l = vec_add(R_l, vy32_l);
781  R_r = vec_mul(vv32_r, v2r_coeff);
782  R_r = vec_add(R_r, vy32_r);
783  G_l = vec_mul(vv32_l, v2g_coeff);
784  tmp32 = vec_mul(vu32_l, u2g_coeff);
785  G_l = vec_add(G_l, vy32_l);
786  G_l = vec_add(G_l, tmp32);
787  G_r = vec_mul(vv32_r, v2g_coeff);
788  tmp32 = vec_mul(vu32_r, u2g_coeff);
789  G_r = vec_add(G_r, vy32_r);
790  G_r = vec_add(G_r, tmp32);
791 
792  B_l = vec_mul(vu32_l, u2b_coeff);
793  B_l = vec_add(B_l, vy32_l);
794  B_r = vec_mul(vu32_r, u2b_coeff);
795  B_r = vec_add(B_r, vy32_r);
796 
797  WRITERGB
798  }
799 }
800 
801 static av_always_inline void
802 yuv2rgb_2_vsx_template(SwsContext *c, const int16_t *buf[2],
803  const int16_t *ubuf[2], const int16_t *vbuf[2],
804  const int16_t *abuf[2], uint8_t *dest, int dstW,
805  int yalpha, int uvalpha, int y,
806  enum AVPixelFormat target, int hasAlpha)
807 {
808  const int16_t *buf0 = buf[0], *buf1 = buf[1],
809  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
810  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
811  *abuf0 = hasAlpha ? abuf[0] : NULL,
812  *abuf1 = hasAlpha ? abuf[1] : NULL;
813  const int16_t yalpha1 = 4096 - yalpha;
814  const int16_t uvalpha1 = 4096 - uvalpha;
815  vec_s16 vy, vu, vv, A = vec_splat_s16(0);
816  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
817  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r, vud32_l, vud32_r, vvd32_l, vvd32_r;
818  vec_s32 tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
819  vec_u16 rd16, gd16, bd16;
820  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
821  const vec_s16 vyalpha1 = vec_splats(yalpha1);
822  const vec_s16 vuvalpha1 = vec_splats(uvalpha1);
823  const vec_s16 vyalpha = vec_splats((int16_t) yalpha);
824  const vec_s16 vuvalpha = vec_splats((int16_t) uvalpha);
825  const vec_u16 zero16 = vec_splat_u16(0);
826  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
827  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
828  const vec_s32 y_add = vec_splats(1 << 21);
829  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
830  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
831  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
832  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
833  const vec_s32 rgbclip = vec_splats(1 << 30);
834  const vec_s32 zero32 = vec_splat_s32(0);
835  const vec_u32 shift19 = vec_splats(19U);
836  const vec_u32 shift22 = vec_splats(22U);
837  const vec_u32 shift10 = vec_splat_u32(10);
838  const vec_s32 dec128 = vec_splats(128 << 19);
839  const vec_s32 add18 = vec_splats(1 << 18);
840  int i;
841 
842  // Various permutations
843  const vec_u8 doubleleft = (vec_u8) {0, 1, 2, 3,
844  0, 1, 2, 3,
845  4, 5, 6, 7,
846  4, 5, 6, 7 };
847  const vec_u8 doubleright = (vec_u8) {8, 9, 10, 11,
848  8, 9, 10, 11,
849  12, 13, 14, 15,
850  12, 13, 14, 15 };
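 /* doubleleft/doubleright duplicate each 32-bit chroma value so one
  * U/V sample covers two adjacent luma pixels: doubleleft yields words
  * {0, 0, 1, 1} of its source vector, doubleright words {2, 2, 3, 3}. */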
851  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
852  0x1, 0x11, 0,
853  0x2, 0x12, 0,
854  0x3, 0x13, 0,
855  0x4, 0x14, 0,
856  0x5 };
857  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
858  0x6, 0x16, 0,
859  0x7, 0x17, 0 };
860  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
861  0x3, 0x4, 0x11,
862  0x6, 0x7, 0x12,
863  0x9, 0xa, 0x13,
864  0xc, 0xd, 0x14,
865  0xf };
866  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
867  0x2, 0x3, 0x16,
868  0x5, 0x6, 0x17 };
869 
870  av_assert2(yalpha <= 4096U);
871  av_assert2(uvalpha <= 4096U);
872 
873  for (i = 0; i < (dstW + 1) >> 1; i += 8) {
874  SETUP(vy, &buf0[i * 2], vyalpha1, &buf1[i * 2], vyalpha);
875  vy32_l = vec_sra(tmp3, shift10);
876  vy32_r = vec_sra(tmp4, shift10);
877 
878  SETUP(vu, &ubuf0[i], vuvalpha1, &ubuf1[i], vuvalpha);
879  tmp3 = vec_sub(tmp3, dec128);
880  tmp4 = vec_sub(tmp4, dec128);
881  vu32_l = vec_sra(tmp3, shift10);
882  vu32_r = vec_sra(tmp4, shift10);
883 
884  SETUP(vv, &vbuf0[i], vuvalpha1, &vbuf1[i], vuvalpha);
885  tmp3 = vec_sub(tmp3, dec128);
886  tmp4 = vec_sub(tmp4, dec128);
887  vv32_l = vec_sra(tmp3, shift10);
888  vv32_r = vec_sra(tmp4, shift10);
889 
890  if (hasAlpha) {
891  SETUP(A, &abuf0[i], vyalpha1, &abuf1[i], vyalpha);
892  tmp3 = vec_add(tmp3, add18);
893  tmp4 = vec_add(tmp4, add18);
894  tmp3 = vec_sra(tmp3, shift19);
895  tmp4 = vec_sra(tmp4, shift19);
896  A = vec_packs(tmp3, tmp4);
897  ad = vec_packsu(A, (vec_s16) zero16);
898  } else {
899  ad = vec_splats((uint8_t) 255);
900  }
901 
902  vy32_l = vec_sub(vy32_l, y_offset);
903  vy32_r = vec_sub(vy32_r, y_offset);
904  vy32_l = vec_mul(vy32_l, y_coeff);
905  vy32_r = vec_mul(vy32_r, y_coeff);
906  vy32_l = vec_add(vy32_l, y_add);
907  vy32_r = vec_add(vy32_r, y_add);
908 
909  // Use the first UV half
910  vud32_l = vec_perm(vu32_l, vu32_l, doubleleft);
911  vud32_r = vec_perm(vu32_l, vu32_l, doubleright);
912  vvd32_l = vec_perm(vv32_l, vv32_l, doubleleft);
913  vvd32_r = vec_perm(vv32_l, vv32_l, doubleright);
914 
915  R_l = vec_mul(vvd32_l, v2r_coeff);
916  R_l = vec_add(R_l, vy32_l);
917  R_r = vec_mul(vvd32_r, v2r_coeff);
918  R_r = vec_add(R_r, vy32_r);
919  G_l = vec_mul(vvd32_l, v2g_coeff);
920  tmp32 = vec_mul(vud32_l, u2g_coeff);
921  G_l = vec_add(G_l, vy32_l);
922  G_l = vec_add(G_l, tmp32);
923  G_r = vec_mul(vvd32_r, v2g_coeff);
924  tmp32 = vec_mul(vud32_r, u2g_coeff);
925  G_r = vec_add(G_r, vy32_r);
926  G_r = vec_add(G_r, tmp32);
927 
928  B_l = vec_mul(vud32_l, u2b_coeff);
929  B_l = vec_add(B_l, vy32_l);
930  B_r = vec_mul(vud32_r, u2b_coeff);
931  B_r = vec_add(B_r, vy32_r);
932 
933  WRITERGB
934 
935  // New Y for the second half
936  SETUP(vy, &buf0[i * 2 + 8], vyalpha1, &buf1[i * 2 + 8], vyalpha);
937  vy32_l = vec_sra(tmp3, shift10);
938  vy32_r = vec_sra(tmp4, shift10);
939 
940  vy32_l = vec_sub(vy32_l, y_offset);
941  vy32_r = vec_sub(vy32_r, y_offset);
942  vy32_l = vec_mul(vy32_l, y_coeff);
943  vy32_r = vec_mul(vy32_r, y_coeff);
944  vy32_l = vec_add(vy32_l, y_add);
945  vy32_r = vec_add(vy32_r, y_add);
946 
947  // Second UV half
948  vud32_l = vec_perm(vu32_r, vu32_r, doubleleft);
949  vud32_r = vec_perm(vu32_r, vu32_r, doubleright);
950  vvd32_l = vec_perm(vv32_r, vv32_r, doubleleft);
951  vvd32_r = vec_perm(vv32_r, vv32_r, doubleright);
952 
953  R_l = vec_mul(vvd32_l, v2r_coeff);
954  R_l = vec_add(R_l, vy32_l);
955  R_r = vec_mul(vvd32_r, v2r_coeff);
956  R_r = vec_add(R_r, vy32_r);
957  G_l = vec_mul(vvd32_l, v2g_coeff);
958  tmp32 = vec_mul(vud32_l, u2g_coeff);
959  G_l = vec_add(G_l, vy32_l);
960  G_l = vec_add(G_l, tmp32);
961  G_r = vec_mul(vvd32_r, v2g_coeff);
962  tmp32 = vec_mul(vud32_r, u2g_coeff);
963  G_r = vec_add(G_r, vy32_r);
964  G_r = vec_add(G_r, tmp32);
965 
966  B_l = vec_mul(vud32_l, u2b_coeff);
967  B_l = vec_add(B_l, vy32_l);
968  B_r = vec_mul(vud32_r, u2b_coeff);
969  B_r = vec_add(B_r, vy32_r);
970 
971  WRITERGB
972  }
973 }
974 
975 #undef SETUP
976 
977 static av_always_inline void
978 yuv2rgb_full_1_vsx_template(SwsContext *c, const int16_t *buf0,
979  const int16_t *ubuf[2], const int16_t *vbuf[2],
980  const int16_t *abuf0, uint8_t *dest, int dstW,
981  int uvalpha, int y, enum AVPixelFormat target,
982  int hasAlpha)
983 {
984  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
985  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
986  vec_s16 vy, vu, vv, A = vec_splat_s16(0), tmp16;
987  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32, tmp32_2;
988  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r;
989  vec_u16 rd16, gd16, bd16;
990  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
991  const vec_u16 zero16 = vec_splat_u16(0);
992  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
993  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
994  const vec_s32 y_add = vec_splats(1 << 21);
995  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
996  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
997  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
998  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
999  const vec_s32 rgbclip = vec_splats(1 << 30);
1000  const vec_s32 zero32 = vec_splat_s32(0);
1001  const vec_u32 shift2 = vec_splat_u32(2);
1002  const vec_u32 shift22 = vec_splats(22U);
1003  const vec_u16 sub7 = vec_splats((uint16_t) (128 << 7));
1004  const vec_u16 sub8 = vec_splats((uint16_t) (128 << 8));
1005  const vec_s16 mul4 = vec_splat_s16(4);
1006  const vec_s16 mul8 = vec_splat_s16(8);
1007  const vec_s16 add64 = vec_splat_s16(64);
1008  const vec_u16 shift7 = vec_splat_u16(7);
1009  const vec_s16 max255 = vec_splat_s16(255);
1010  int i;
1011 
1012  // Various permutations
1013  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
1014  0x1, 0x11, 0,
1015  0x2, 0x12, 0,
1016  0x3, 0x13, 0,
1017  0x4, 0x14, 0,
1018  0x5 };
1019  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
1020  0x6, 0x16, 0,
1021  0x7, 0x17, 0 };
1022  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
1023  0x3, 0x4, 0x11,
1024  0x6, 0x7, 0x12,
1025  0x9, 0xa, 0x13,
1026  0xc, 0xd, 0x14,
1027  0xf };
1028  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
1029  0x2, 0x3, 0x16,
1030  0x5, 0x6, 0x17 };
1031 
1032  for (i = 0; i < dstW; i += 8) { // The x86 asm also overwrites padding bytes.
1033  vy = vec_ld(0, &buf0[i]);
1034  vy32_l = vec_unpackh(vy);
1035  vy32_r = vec_unpackl(vy);
1036  vy32_l = vec_sl(vy32_l, shift2);
1037  vy32_r = vec_sl(vy32_r, shift2);
1038 
1039  vu = vec_ld(0, &ubuf0[i]);
1040  vv = vec_ld(0, &vbuf0[i]);
1041  if (uvalpha < 2048) {
1042  vu = (vec_s16) vec_sub((vec_u16) vu, sub7);
1043  vv = (vec_s16) vec_sub((vec_u16) vv, sub7);
1044 
1045  tmp32 = vec_mule(vu, mul4);
1046  tmp32_2 = vec_mulo(vu, mul4);
1047  vu32_l = vec_mergeh(tmp32, tmp32_2);
1048  vu32_r = vec_mergel(tmp32, tmp32_2);
1049  tmp32 = vec_mule(vv, mul4);
1050  tmp32_2 = vec_mulo(vv, mul4);
1051  vv32_l = vec_mergeh(tmp32, tmp32_2);
1052  vv32_r = vec_mergel(tmp32, tmp32_2);
1053  } else {
1054  tmp16 = vec_ld(0, &ubuf1[i]);
1055  vu = vec_add(vu, tmp16);
1056  vu = (vec_s16) vec_sub((vec_u16) vu, sub8);
1057  tmp16 = vec_ld(0, &vbuf1[i]);
1058  vv = vec_add(vv, tmp16);
1059  vv = (vec_s16) vec_sub((vec_u16) vv, sub8);
1060 
1061  vu32_l = vec_mule(vu, mul8);
1062  vu32_r = vec_mulo(vu, mul8);
1063  vv32_l = vec_mule(vv, mul8);
1064  vv32_r = vec_mulo(vv, mul8);
1065  }
1066 
1067  if (hasAlpha) {
1068  A = vec_ld(0, &abuf0[i]);
1069  A = vec_add(A, add64);
1070  A = vec_sr(A, shift7);
1071  A = vec_max(A, max255);
1072  ad = vec_packsu(A, (vec_s16) zero16);
1073  } else {
1074  ad = vec_splats((uint8_t) 255);
1075  }
1076 
1077  vy32_l = vec_sub(vy32_l, y_offset);
1078  vy32_r = vec_sub(vy32_r, y_offset);
1079  vy32_l = vec_mul(vy32_l, y_coeff);
1080  vy32_r = vec_mul(vy32_r, y_coeff);
1081  vy32_l = vec_add(vy32_l, y_add);
1082  vy32_r = vec_add(vy32_r, y_add);
1083 
1084  R_l = vec_mul(vv32_l, v2r_coeff);
1085  R_l = vec_add(R_l, vy32_l);
1086  R_r = vec_mul(vv32_r, v2r_coeff);
1087  R_r = vec_add(R_r, vy32_r);
1088  G_l = vec_mul(vv32_l, v2g_coeff);
1089  tmp32 = vec_mul(vu32_l, u2g_coeff);
1090  G_l = vec_add(G_l, vy32_l);
1091  G_l = vec_add(G_l, tmp32);
1092  G_r = vec_mul(vv32_r, v2g_coeff);
1093  tmp32 = vec_mul(vu32_r, u2g_coeff);
1094  G_r = vec_add(G_r, vy32_r);
1095  G_r = vec_add(G_r, tmp32);
1096 
1097  B_l = vec_mul(vu32_l, u2b_coeff);
1098  B_l = vec_add(B_l, vy32_l);
1099  B_r = vec_mul(vu32_r, u2b_coeff);
1100  B_r = vec_add(B_r, vy32_r);
1101 
1102  WRITERGB
1103  }
1104 }
1105 
1106 static av_always_inline void
1107 yuv2rgb_1_vsx_template(SwsContext *c, const int16_t *buf0,
1108  const int16_t *ubuf[2], const int16_t *vbuf[2],
1109  const int16_t *abuf0, uint8_t *dest, int dstW,
1110  int uvalpha, int y, enum AVPixelFormat target,
1111  int hasAlpha)
1112 {
1113  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
1114  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
1115  vec_s16 vy, vu, vv, A = vec_splat_s16(0), tmp16;
1116  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32, tmp32_2;
1117  vec_s32 vud32_l, vud32_r, vvd32_l, vvd32_r;
1118  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r;
1119  vec_u16 rd16, gd16, bd16;
1120  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
1121  const vec_u16 zero16 = vec_splat_u16(0);
1122  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
1123  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
1124  const vec_s32 y_add = vec_splats(1 << 21);
1125  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
1126  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
1127  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
1128  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
1129  const vec_s32 rgbclip = vec_splats(1 << 30);
1130  const vec_s32 zero32 = vec_splat_s32(0);
1131  const vec_u32 shift2 = vec_splat_u32(2);
1132  const vec_u32 shift22 = vec_splats(22U);
1133  const vec_u16 sub7 = vec_splats((uint16_t) (128 << 7));
1134  const vec_u16 sub8 = vec_splats((uint16_t) (128 << 8));
1135  const vec_s16 mul4 = vec_splat_s16(4);
1136  const vec_s16 mul8 = vec_splat_s16(8);
1137  const vec_s16 add64 = vec_splat_s16(64);
1138  const vec_u16 shift7 = vec_splat_u16(7);
1139  const vec_s16 max255 = vec_splat_s16(255);
1140  int i;
1141 
1142  // Various permutations
1143  const vec_u8 doubleleft = (vec_u8) {0, 1, 2, 3,
1144  0, 1, 2, 3,
1145  4, 5, 6, 7,
1146  4, 5, 6, 7 };
1147  const vec_u8 doubleright = (vec_u8) {8, 9, 10, 11,
1148  8, 9, 10, 11,
1149  12, 13, 14, 15,
1150  12, 13, 14, 15 };
1151  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
1152  0x1, 0x11, 0,
1153  0x2, 0x12, 0,
1154  0x3, 0x13, 0,
1155  0x4, 0x14, 0,
1156  0x5 };
1157  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
1158  0x6, 0x16, 0,
1159  0x7, 0x17, 0 };
1160  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
1161  0x3, 0x4, 0x11,
1162  0x6, 0x7, 0x12,
1163  0x9, 0xa, 0x13,
1164  0xc, 0xd, 0x14,
1165  0xf };
1166  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
1167  0x2, 0x3, 0x16,
1168  0x5, 0x6, 0x17 };
1169 
1170  for (i = 0; i < (dstW + 1) >> 1; i += 8) { // The x86 asm also overwrites padding bytes.
1171  vy = vec_ld(0, &buf0[i * 2]);
1172  vy32_l = vec_unpackh(vy);
1173  vy32_r = vec_unpackl(vy);
1174  vy32_l = vec_sl(vy32_l, shift2);
1175  vy32_r = vec_sl(vy32_r, shift2);
1176 
1177  vu = vec_ld(0, &ubuf0[i]);
1178  vv = vec_ld(0, &vbuf0[i]);
1179  if (uvalpha < 2048) {
1180  vu = (vec_s16) vec_sub((vec_u16) vu, sub7);
1181  vv = (vec_s16) vec_sub((vec_u16) vv, sub7);
1182 
1183  tmp32 = vec_mule(vu, mul4);
1184  tmp32_2 = vec_mulo(vu, mul4);
1185  vu32_l = vec_mergeh(tmp32, tmp32_2);
1186  vu32_r = vec_mergel(tmp32, tmp32_2);
1187  tmp32 = vec_mule(vv, mul4);
1188  tmp32_2 = vec_mulo(vv, mul4);
1189  vv32_l = vec_mergeh(tmp32, tmp32_2);
1190  vv32_r = vec_mergel(tmp32, tmp32_2);
1191  } else {
1192  tmp16 = vec_ld(0, &ubuf1[i]);
1193  vu = vec_add(vu, tmp16);
1194  vu = (vec_s16) vec_sub((vec_u16) vu, sub8);
1195  tmp16 = vec_ld(0, &vbuf1[i]);
1196  vv = vec_add(vv, tmp16);
1197  vv = (vec_s16) vec_sub((vec_u16) vv, sub8);
1198 
1199  vu32_l = vec_mule(vu, mul8);
1200  vu32_r = vec_mulo(vu, mul8);
1201  vv32_l = vec_mule(vv, mul8);
1202  vv32_r = vec_mulo(vv, mul8);
1203  }
1204 
1205  if (hasAlpha) {
1206  A = vec_ld(0, &abuf0[i]);
1207  A = vec_add(A, add64);
1208  A = vec_sr(A, shift7);
1209  A = vec_max(A, max255);
1210  ad = vec_packsu(A, (vec_s16) zero16);
1211  } else {
1212  ad = vec_splats((uint8_t) 255);
1213  }
1214 
1215  vy32_l = vec_sub(vy32_l, y_offset);
1216  vy32_r = vec_sub(vy32_r, y_offset);
1217  vy32_l = vec_mul(vy32_l, y_coeff);
1218  vy32_r = vec_mul(vy32_r, y_coeff);
1219  vy32_l = vec_add(vy32_l, y_add);
1220  vy32_r = vec_add(vy32_r, y_add);
1221 
1222  // Use the first UV half
1223  vud32_l = vec_perm(vu32_l, vu32_l, doubleleft);
1224  vud32_r = vec_perm(vu32_l, vu32_l, doubleright);
1225  vvd32_l = vec_perm(vv32_l, vv32_l, doubleleft);
1226  vvd32_r = vec_perm(vv32_l, vv32_l, doubleright);
1227 
1228  R_l = vec_mul(vvd32_l, v2r_coeff);
1229  R_l = vec_add(R_l, vy32_l);
1230  R_r = vec_mul(vvd32_r, v2r_coeff);
1231  R_r = vec_add(R_r, vy32_r);
1232  G_l = vec_mul(vvd32_l, v2g_coeff);
1233  tmp32 = vec_mul(vud32_l, u2g_coeff);
1234  G_l = vec_add(G_l, vy32_l);
1235  G_l = vec_add(G_l, tmp32);
1236  G_r = vec_mul(vvd32_r, v2g_coeff);
1237  tmp32 = vec_mul(vud32_r, u2g_coeff);
1238  G_r = vec_add(G_r, vy32_r);
1239  G_r = vec_add(G_r, tmp32);
1240 
1241  B_l = vec_mul(vud32_l, u2b_coeff);
1242  B_l = vec_add(B_l, vy32_l);
1243  B_r = vec_mul(vud32_r, u2b_coeff);
1244  B_r = vec_add(B_r, vy32_r);
1245 
1246  WRITERGB
1247 
1248  // New Y for the second half
1249  vy = vec_ld(16, &buf0[i * 2]);
1250  vy32_l = vec_unpackh(vy);
1251  vy32_r = vec_unpackl(vy);
1252  vy32_l = vec_sl(vy32_l, shift2);
1253  vy32_r = vec_sl(vy32_r, shift2);
1254 
1255  vy32_l = vec_sub(vy32_l, y_offset);
1256  vy32_r = vec_sub(vy32_r, y_offset);
1257  vy32_l = vec_mul(vy32_l, y_coeff);
1258  vy32_r = vec_mul(vy32_r, y_coeff);
1259  vy32_l = vec_add(vy32_l, y_add);
1260  vy32_r = vec_add(vy32_r, y_add);
1261 
1262  // Second UV half
1263  vud32_l = vec_perm(vu32_r, vu32_r, doubleleft);
1264  vud32_r = vec_perm(vu32_r, vu32_r, doubleright);
1265  vvd32_l = vec_perm(vv32_r, vv32_r, doubleleft);
1266  vvd32_r = vec_perm(vv32_r, vv32_r, doubleright);
1267 
1268  R_l = vec_mul(vvd32_l, v2r_coeff);
1269  R_l = vec_add(R_l, vy32_l);
1270  R_r = vec_mul(vvd32_r, v2r_coeff);
1271  R_r = vec_add(R_r, vy32_r);
1272  G_l = vec_mul(vvd32_l, v2g_coeff);
1273  tmp32 = vec_mul(vud32_l, u2g_coeff);
1274  G_l = vec_add(G_l, vy32_l);
1275  G_l = vec_add(G_l, tmp32);
1276  G_r = vec_mul(vvd32_r, v2g_coeff);
1277  tmp32 = vec_mul(vud32_r, u2g_coeff);
1278  G_r = vec_add(G_r, vy32_r);
1279  G_r = vec_add(G_r, tmp32);
1280 
1281  B_l = vec_mul(vud32_l, u2b_coeff);
1282  B_l = vec_add(B_l, vy32_l);
1283  B_r = vec_mul(vud32_r, u2b_coeff);
1284  B_r = vec_add(B_r, vy32_r);
1285 
1286  WRITERGB
1287  }
1288 }
1289 
1290 #undef WRITERGB
1291 
1292 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1293 static void name ## ext ## _X_vsx(SwsContext *c, const int16_t *lumFilter, \
1294  const int16_t **lumSrc, int lumFilterSize, \
1295  const int16_t *chrFilter, const int16_t **chrUSrc, \
1296  const int16_t **chrVSrc, int chrFilterSize, \
1297  const int16_t **alpSrc, uint8_t *dest, int dstW, \
1298  int y) \
1299 { \
1300  name ## base ## _X_vsx_template(c, lumFilter, lumSrc, lumFilterSize, \
1301  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1302  alpSrc, dest, dstW, y, fmt, hasAlpha); \
1303 }
1304 
1305 #define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
1306 static void name ## ext ## _2_vsx(SwsContext *c, const int16_t *buf[2], \
1307  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1308  const int16_t *abuf[2], uint8_t *dest, int dstW, \
1309  int yalpha, int uvalpha, int y) \
1310 { \
1311  name ## base ## _2_vsx_template(c, buf, ubuf, vbuf, abuf, \
1312  dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
1313 }
1314 
1315 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
1316 static void name ## ext ## _1_vsx(SwsContext *c, const int16_t *buf0, \
1317  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1318  const int16_t *abuf0, uint8_t *dest, int dstW, \
1319  int uvalpha, int y) \
1320 { \
1321  name ## base ## _1_vsx_template(c, buf0, ubuf, vbuf, abuf0, dest, \
1322  dstW, uvalpha, y, fmt, hasAlpha); \
1323 }
1324 
1325 YUV2RGBWRAPPER(yuv2, rgb, bgrx32, AV_PIX_FMT_BGRA, 0)
1326 YUV2RGBWRAPPER(yuv2, rgb, rgbx32, AV_PIX_FMT_RGBA, 0)
1327 YUV2RGBWRAPPER(yuv2, rgb, xrgb32, AV_PIX_FMT_ARGB, 0)
1328 YUV2RGBWRAPPER(yuv2, rgb, xbgr32, AV_PIX_FMT_ABGR, 0)
1329 
1330 YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
1331 YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
1332 
1333 YUV2RGBWRAPPERX2(yuv2, rgb, bgrx32, AV_PIX_FMT_BGRA, 0)
1334 YUV2RGBWRAPPERX2(yuv2, rgb, rgbx32, AV_PIX_FMT_RGBA, 0)
1335 YUV2RGBWRAPPERX2(yuv2, rgb, xrgb32, AV_PIX_FMT_ARGB, 0)
1336 YUV2RGBWRAPPERX2(yuv2, rgb, xbgr32, AV_PIX_FMT_ABGR, 0)
1337 
1338 YUV2RGBWRAPPERX2(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
1339 YUV2RGBWRAPPERX2(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
1340 
1341 YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
1342 YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
1343 YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
1344 YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
1345 
1346 YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
1347 YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
1348 
1349 YUV2RGBWRAPPERX2(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
1350 YUV2RGBWRAPPERX2(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
1351 YUV2RGBWRAPPERX2(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
1352 YUV2RGBWRAPPERX2(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
1353 
1354 YUV2RGBWRAPPERX2(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
1355 YUV2RGBWRAPPERX2(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
1356 
1357 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
1358 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
1359 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
1360 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
1361 
1362 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
1363 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
1364 
1365 static av_always_inline void
1366 write422(const vec_s16 vy1, const vec_s16 vy2,
1367  const vec_s16 vu, const vec_s16 vv,
1368  uint8_t *dest, const enum AVPixelFormat target)
1369 {
1370  vec_u8 vd1, vd2, tmp;
1371  const vec_u8 yuyv1 = (vec_u8) {
1372  0x0, 0x10, 0x1, 0x18,
1373  0x2, 0x11, 0x3, 0x19,
1374  0x4, 0x12, 0x5, 0x1a,
1375  0x6, 0x13, 0x7, 0x1b };
1376  const vec_u8 yuyv2 = (vec_u8) {
1377  0x8, 0x14, 0x9, 0x1c,
1378  0xa, 0x15, 0xb, 0x1d,
1379  0xc, 0x16, 0xd, 0x1e,
1380  0xe, 0x17, 0xf, 0x1f };
1381  const vec_u8 yvyu1 = (vec_u8) {
1382  0x0, 0x18, 0x1, 0x10,
1383  0x2, 0x19, 0x3, 0x11,
1384  0x4, 0x1a, 0x5, 0x12,
1385  0x6, 0x1b, 0x7, 0x13 };
1386  const vec_u8 yvyu2 = (vec_u8) {
1387  0x8, 0x1c, 0x9, 0x14,
1388  0xa, 0x1d, 0xb, 0x15,
1389  0xc, 0x1e, 0xd, 0x16,
1390  0xe, 0x1f, 0xf, 0x17 };
1391  const vec_u8 uyvy1 = (vec_u8) {
1392  0x10, 0x0, 0x18, 0x1,
1393  0x11, 0x2, 0x19, 0x3,
1394  0x12, 0x4, 0x1a, 0x5,
1395  0x13, 0x6, 0x1b, 0x7 };
1396  const vec_u8 uyvy2 = (vec_u8) {
1397  0x14, 0x8, 0x1c, 0x9,
1398  0x15, 0xa, 0x1d, 0xb,
1399  0x16, 0xc, 0x1e, 0xd,
1400  0x17, 0xe, 0x1f, 0xf };
1401 
1402  vd1 = vec_packsu(vy1, vy2);
1403  vd2 = vec_packsu(vu, vv);
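 /* vd1 now holds the 16 luma bytes Y0..Y15 and vd2 holds U0..U7
  * followed by V0..V7; the permutation tables above interleave them,
  * e.g. yuyv1 begins {0x0, 0x10, 0x1, 0x18} = Y0 U0 Y1 V0. */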
1404 
1405  switch (target) {
1406  case AV_PIX_FMT_YUYV422:
1407  tmp = vec_perm(vd1, vd2, yuyv1);
1408  vec_st(tmp, 0, dest);
1409  tmp = vec_perm(vd1, vd2, yuyv2);
1410  vec_st(tmp, 16, dest);
1411  break;
1412  case AV_PIX_FMT_YVYU422:
1413  tmp = vec_perm(vd1, vd2, yvyu1);
1414  vec_st(tmp, 0, dest);
1415  tmp = vec_perm(vd1, vd2, yvyu2);
1416  vec_st(tmp, 16, dest);
1417  break;
1418  case AV_PIX_FMT_UYVY422:
1419  tmp = vec_perm(vd1, vd2, uyvy1);
1420  vec_st(tmp, 0, dest);
1421  tmp = vec_perm(vd1, vd2, uyvy2);
1422  vec_st(tmp, 16, dest);
1423  break;
1424  }
1425 }
1426 
1427 static av_always_inline void
1428 yuv2422_X_vsx_template(SwsContext *c, const int16_t *lumFilter,
1429  const int16_t **lumSrc, int lumFilterSize,
1430  const int16_t *chrFilter, const int16_t **chrUSrc,
1431  const int16_t **chrVSrc, int chrFilterSize,
1432  const int16_t **alpSrc, uint8_t *dest, int dstW,
1433  int y, enum AVPixelFormat target)
1434 {
1435  int i, j;
1436  vec_s16 vy1, vy2, vu, vv;
1437  vec_s32 vy32[4], vu32[2], vv32[2], tmp, tmp2, tmp3, tmp4;
1438  vec_s16 vlumFilter[MAX_FILTER_SIZE], vchrFilter[MAX_FILTER_SIZE];
1439  const vec_s32 start = vec_splats(1 << 18);
1440  const vec_u32 shift19 = vec_splats(19U);
1441 
1442  for (i = 0; i < lumFilterSize; i++)
1443  vlumFilter[i] = vec_splats(lumFilter[i]);
1444  for (i = 0; i < chrFilterSize; i++)
1445  vchrFilter[i] = vec_splats(chrFilter[i]);
1446 
1447  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1448  vy32[0] =
1449  vy32[1] =
1450  vy32[2] =
1451  vy32[3] =
1452  vu32[0] =
1453  vu32[1] =
1454  vv32[0] =
1455  vv32[1] = start;
1456 
1457  for (j = 0; j < lumFilterSize; j++) {
1458  vv = vec_ld(0, &lumSrc[j][i * 2]);
1459  tmp = vec_mule(vv, vlumFilter[j]);
1460  tmp2 = vec_mulo(vv, vlumFilter[j]);
1461  tmp3 = vec_mergeh(tmp, tmp2);
1462  tmp4 = vec_mergel(tmp, tmp2);
1463 
1464  vy32[0] = vec_adds(vy32[0], tmp3);
1465  vy32[1] = vec_adds(vy32[1], tmp4);
1466 
1467  vv = vec_ld(0, &lumSrc[j][(i + 4) * 2]);
1468  tmp = vec_mule(vv, vlumFilter[j]);
1469  tmp2 = vec_mulo(vv, vlumFilter[j]);
1470  tmp3 = vec_mergeh(tmp, tmp2);
1471  tmp4 = vec_mergel(tmp, tmp2);
1472 
1473  vy32[2] = vec_adds(vy32[2], tmp3);
1474  vy32[3] = vec_adds(vy32[3], tmp4);
1475  }
1476 
1477  for (j = 0; j < chrFilterSize; j++) {
1478  vv = vec_ld(0, &chrUSrc[j][i]);
1479  tmp = vec_mule(vv, vchrFilter[j]);
1480  tmp2 = vec_mulo(vv, vchrFilter[j]);
1481  tmp3 = vec_mergeh(tmp, tmp2);
1482  tmp4 = vec_mergel(tmp, tmp2);
1483 
1484  vu32[0] = vec_adds(vu32[0], tmp3);
1485  vu32[1] = vec_adds(vu32[1], tmp4);
1486 
1487  vv = vec_ld(0, &chrVSrc[j][i]);
1488  tmp = vec_mule(vv, vchrFilter[j]);
1489  tmp2 = vec_mulo(vv, vchrFilter[j]);
1490  tmp3 = vec_mergeh(tmp, tmp2);
1491  tmp4 = vec_mergel(tmp, tmp2);
1492 
1493  vv32[0] = vec_adds(vv32[0], tmp3);
1494  vv32[1] = vec_adds(vv32[1], tmp4);
1495  }
1496 
1497  for (j = 0; j < 4; j++) {
1498  vy32[j] = vec_sra(vy32[j], shift19);
1499  }
1500  for (j = 0; j < 2; j++) {
1501  vu32[j] = vec_sra(vu32[j], shift19);
1502  vv32[j] = vec_sra(vv32[j], shift19);
1503  }
1504 
1505  vy1 = vec_packs(vy32[0], vy32[1]);
1506  vy2 = vec_packs(vy32[2], vy32[3]);
1507  vu = vec_packs(vu32[0], vu32[1]);
1508  vv = vec_packs(vv32[0], vv32[1]);
1509 
1510  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1511  }
1512 }
1513 
1514 #define SETUP(x, buf0, buf1, alpha) { \
1515  x = vec_ld(0, buf0); \
1516  tmp = vec_mule(x, alpha); \
1517  tmp2 = vec_mulo(x, alpha); \
1518  tmp3 = vec_mergeh(tmp, tmp2); \
1519  tmp4 = vec_mergel(tmp, tmp2); \
1520 \
1521  x = vec_ld(0, buf1); \
1522  tmp = vec_mule(x, alpha); \
1523  tmp2 = vec_mulo(x, alpha); \
1524  tmp5 = vec_mergeh(tmp, tmp2); \
1525  tmp6 = vec_mergel(tmp, tmp2); \
1526 \
1527  tmp3 = vec_add(tmp3, tmp5); \
1528  tmp4 = vec_add(tmp4, tmp6); \
1529 \
1530  tmp3 = vec_sra(tmp3, shift19); \
1531  tmp4 = vec_sra(tmp4, shift19); \
1532  x = vec_packs(tmp3, tmp4); \
1533 }
1534 
1535 static av_always_inline void
1536 yuv2422_2_vsx_template(SwsContext *c, const int16_t *buf[2],
1537  const int16_t *ubuf[2], const int16_t *vbuf[2],
1538  const int16_t *abuf[2], uint8_t *dest, int dstW,
1539  int yalpha, int uvalpha, int y,
1540  enum AVPixelFormat target)
1541 {
1542  const int16_t *buf0 = buf[0], *buf1 = buf[1],
1543  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1544  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1545  const int16_t yalpha1 = 4096 - yalpha;
1546  const int16_t uvalpha1 = 4096 - uvalpha;
1547  vec_s16 vy1, vy2, vu, vv;
1548  vec_s32 tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
1549  const vec_s16 vyalpha1 = vec_splats(yalpha1);
1550  const vec_s16 vuvalpha1 = vec_splats(uvalpha1);
1551  const vec_u32 shift19 = vec_splats(19U);
1552  int i;
1553  av_assert2(yalpha <= 4096U);
1554  av_assert2(uvalpha <= 4096U);
1555 
1556  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1557 
1558  SETUP(vy1, &buf0[i * 2], &buf1[i * 2], vyalpha1)
1559  SETUP(vy2, &buf0[(i + 4) * 2], &buf1[(i + 4) * 2], vyalpha1)
1560  SETUP(vu, &ubuf0[i], &ubuf1[i], vuvalpha1)
1561  SETUP(vv, &vbuf0[i], &vbuf1[i], vuvalpha1)
1562 
1563  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1564  }
1565 }
1566 
1567 #undef SETUP
1568 
1569 static av_always_inline void
1570 yuv2422_1_vsx_template(SwsContext *c, const int16_t *buf0,
1571  const int16_t *ubuf[2], const int16_t *vbuf[2],
1572  const int16_t *abuf0, uint8_t *dest, int dstW,
1573  int uvalpha, int y, enum AVPixelFormat target)
1574 {
1575  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
1576  vec_s16 vy1, vy2, vu, vv, tmp;
1577  const vec_s16 add64 = vec_splats((int16_t) 64);
1578  const vec_s16 add128 = vec_splats((int16_t) 128);
1579  const vec_u16 shift7 = vec_splat_u16(7);
1580  const vec_u16 shift8 = vec_splat_u16(8);
1581  int i;
1582 
1583  if (uvalpha < 2048) {
1584  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1585  vy1 = vec_ld(0, &buf0[i * 2]);
1586  vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
1587  vu = vec_ld(0, &ubuf0[i]);
1588  vv = vec_ld(0, &vbuf0[i]);
1589 
1590  vy1 = vec_add(vy1, add64);
1591  vy2 = vec_add(vy2, add64);
1592  vu = vec_add(vu, add64);
1593  vv = vec_add(vv, add64);
1594 
1595  vy1 = vec_sra(vy1, shift7);
1596  vy2 = vec_sra(vy2, shift7);
1597  vu = vec_sra(vu, shift7);
1598  vv = vec_sra(vv, shift7);
1599 
1600  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1601  }
1602  } else {
1603  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
1604  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1605  vy1 = vec_ld(0, &buf0[i * 2]);
1606  vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
1607  vu = vec_ld(0, &ubuf0[i]);
1608  tmp = vec_ld(0, &ubuf1[i]);
1609  vu = vec_adds(vu, tmp);
1610  vv = vec_ld(0, &vbuf0[i]);
1611  tmp = vec_ld(0, &vbuf1[i]);
1612  vv = vec_adds(vv, tmp);
1613 
1614  vy1 = vec_add(vy1, add64);
1615  vy2 = vec_add(vy2, add64);
1616  vu = vec_adds(vu, add128);
1617  vv = vec_adds(vv, add128);
1618 
1619  vy1 = vec_sra(vy1, shift7);
1620  vy2 = vec_sra(vy2, shift7);
1621  vu = vec_sra(vu, shift8);
1622  vv = vec_sra(vv, shift8);
1623 
1624  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1625  }
1626  }
1627 }
1628 
1629 #define YUV2PACKEDWRAPPERX(name, base, ext, fmt) \
1630 static void name ## ext ## _X_vsx(SwsContext *c, const int16_t *lumFilter, \
1631  const int16_t **lumSrc, int lumFilterSize, \
1632  const int16_t *chrFilter, const int16_t **chrUSrc, \
1633  const int16_t **chrVSrc, int chrFilterSize, \
1634  const int16_t **alpSrc, uint8_t *dest, int dstW, \
1635  int y) \
1636 { \
1637  name ## base ## _X_vsx_template(c, lumFilter, lumSrc, lumFilterSize, \
1638  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1639  alpSrc, dest, dstW, y, fmt); \
1640 }
1641 
1642 #define YUV2PACKEDWRAPPER2(name, base, ext, fmt) \
1643 YUV2PACKEDWRAPPERX(name, base, ext, fmt) \
1644 static void name ## ext ## _2_vsx(SwsContext *c, const int16_t *buf[2], \
1645  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1646  const int16_t *abuf[2], uint8_t *dest, int dstW, \
1647  int yalpha, int uvalpha, int y) \
1648 { \
1649  name ## base ## _2_vsx_template(c, buf, ubuf, vbuf, abuf, \
1650  dest, dstW, yalpha, uvalpha, y, fmt); \
1651 }
1652 
1653 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
1654 YUV2PACKEDWRAPPER2(name, base, ext, fmt) \
1655 static void name ## ext ## _1_vsx(SwsContext *c, const int16_t *buf0, \
1656  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1657  const int16_t *abuf0, uint8_t *dest, int dstW, \
1658  int uvalpha, int y) \
1659 { \
1660  name ## base ## _1_vsx_template(c, buf0, ubuf, vbuf, \
1661  abuf0, dest, dstW, uvalpha, \
1662  y, fmt); \
1663 }
1664 
1665 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422)
1666 YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422)
1667 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
1668 
1669 static void hyscale_fast_vsx(SwsContext *c, int16_t *dst, int dstWidth,
1670  const uint8_t *src, int srcW, int xInc)
1671 {
1672  int i;
1673  unsigned int xpos = 0, xx;
1674  vec_u8 vin, vin2, vperm;
1675  vec_s8 vmul, valpha;
1676  vec_s16 vtmp, vtmp2, vtmp3, vtmp4;
1677  vec_u16 vd_l, vd_r, vcoord16[2];
1678  vec_u32 vcoord[4];
1679  const vec_u32 vadd = (vec_u32) {
1680  0,
1681  xInc * 1,
1682  xInc * 2,
1683  xInc * 3,
1684  };
1685  const vec_u16 vadd16 = (vec_u16) { // Modulo math: 16-bit wrap-around is intentional, only the fraction is used
1686  0,
1687  xInc * 1,
1688  xInc * 2,
1689  xInc * 3,
1690  xInc * 4,
1691  xInc * 5,
1692  xInc * 6,
1693  xInc * 7,
1694  };
1695  const vec_u32 vshift16 = vec_splats((uint32_t) 16);
1696  const vec_u16 vshift9 = vec_splat_u16(9);
1697  const vec_u8 vzero = vec_splat_u8(0);
1698  const vec_u16 vshift = vec_splat_u16(7);
1699 
1700  for (i = 0; i < dstWidth; i += 16) {
1701  vcoord16[0] = vec_splats((uint16_t) xpos);
1702  vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));
1703 
1704  vcoord16[0] = vec_add(vcoord16[0], vadd16);
1705  vcoord16[1] = vec_add(vcoord16[1], vadd16);
1706 
1707  vcoord16[0] = vec_sr(vcoord16[0], vshift9);
1708  vcoord16[1] = vec_sr(vcoord16[1], vshift9);
1709  valpha = (vec_s8) vec_pack(vcoord16[0], vcoord16[1]);
1710 
1711  xx = xpos >> 16;
1712  vin = vec_vsx_ld(0, &src[xx]);
1713 
1714  vcoord[0] = vec_splats(xpos & 0xffff);
1715  vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
1716  vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
1717  vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);
1718 
1719  vcoord[0] = vec_add(vcoord[0], vadd);
1720  vcoord[1] = vec_add(vcoord[1], vadd);
1721  vcoord[2] = vec_add(vcoord[2], vadd);
1722  vcoord[3] = vec_add(vcoord[3], vadd);
1723 
1724  vcoord[0] = vec_sr(vcoord[0], vshift16);
1725  vcoord[1] = vec_sr(vcoord[1], vshift16);
1726  vcoord[2] = vec_sr(vcoord[2], vshift16);
1727  vcoord[3] = vec_sr(vcoord[3], vshift16);
1728 
1729  vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
1730  vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
1731  vperm = vec_pack(vcoord16[0], vcoord16[1]);
1732 
1733  vin = vec_perm(vin, vin, vperm);
1734 
1735  vin2 = vec_vsx_ld(1, &src[xx]);
1736  vin2 = vec_perm(vin2, vin2, vperm);
1737 
1738  vmul = (vec_s8) vec_sub(vin2, vin);
1739  vtmp = vec_mule(vmul, valpha);
1740  vtmp2 = vec_mulo(vmul, valpha);
1741  vtmp3 = vec_mergeh(vtmp, vtmp2);
1742  vtmp4 = vec_mergel(vtmp, vtmp2);
1743 
1744  vd_l = (vec_u16) vec_mergeh(vin, vzero);
1745  vd_r = (vec_u16) vec_mergel(vin, vzero);
1746  vd_l = vec_sl(vd_l, vshift);
1747  vd_r = vec_sl(vd_r, vshift);
1748 
1749  vd_l = vec_add(vd_l, (vec_u16) vtmp3);
1750  vd_r = vec_add(vd_r, (vec_u16) vtmp4);
1751 
1752  vec_st((vec_s16) vd_l, 0, &dst[i]);
1753  vec_st((vec_s16) vd_r, 0, &dst[i + 8]);
1754 
1755  xpos += xInc * 16;
1756  }
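/* Output pixels whose source coordinate lands on or past the last source
 * sample replicate src[srcW - 1]; the * 128 (<< 7) matches the 7-bit
 * fractional intermediate format. */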
1757  for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
1758  dst[i] = src[srcW - 1] * 128;
1759 }
1760 
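/*
 * HCSCALE emits one 16-pixel bilinear step for a chroma plane: valpha is
 * the 7-bit blend weight and valphaxor = valpha ^ 127, which equals
 * 127 - alpha because the weight fits in 7 bits.  Each output is thus
 * src[xx] * (127 - alpha) + src[xx + 1] * alpha, the same "xalpha ^ 127"
 * blend the generic C chroma scaler uses.
 */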
1761 #define HCSCALE(in, out) \
1762  vin = vec_vsx_ld(0, &in[xx]); \
1763  vin = vec_perm(vin, vin, vperm); \
1764 \
1765  vin2 = vec_vsx_ld(1, &in[xx]); \
1766  vin2 = vec_perm(vin2, vin2, vperm); \
1767 \
1768  vtmp = vec_mule(vin, valphaxor); \
1769  vtmp2 = vec_mulo(vin, valphaxor); \
1770  vtmp3 = vec_mergeh(vtmp, vtmp2); \
1771  vtmp4 = vec_mergel(vtmp, vtmp2); \
1772 \
1773  vtmp = vec_mule(vin2, valpha); \
1774  vtmp2 = vec_mulo(vin2, valpha); \
1775  vd_l = vec_mergeh(vtmp, vtmp2); \
1776  vd_r = vec_mergel(vtmp, vtmp2); \
1777 \
1778  vd_l = vec_add(vd_l, vtmp3); \
1779  vd_r = vec_add(vd_r, vtmp4); \
1780 \
1781  vec_st((vec_s16) vd_l, 0, &out[i]); \
1782  vec_st((vec_s16) vd_r, 0, &out[i + 8])
1783 
1784 static void hcscale_fast_vsx(SwsContext *c, int16_t *dst1, int16_t *dst2,
1785  int dstWidth, const uint8_t *src1,
1786  const uint8_t *src2, int srcW, int xInc)
1787 {
1788  int i;
1789  unsigned int xpos = 0, xx;
1790  vec_u8 vin, vin2, vperm;
1791  vec_u8 valpha, valphaxor;
1792  vec_u16 vtmp, vtmp2, vtmp3, vtmp4;
1793  vec_u16 vd_l, vd_r, vcoord16[2];
1794  vec_u32 vcoord[4];
1795  const vec_u8 vxor = vec_splats((uint8_t) 127);
1796  const vec_u32 vadd = (vec_u32) {
1797  0,
1798  xInc * 1,
1799  xInc * 2,
1800  xInc * 3,
1801  };
1802  const vec_u16 vadd16 = (vec_u16) { // Modulo math: 16-bit wrap-around is intentional, only the fraction is used
1803  0,
1804  xInc * 1,
1805  xInc * 2,
1806  xInc * 3,
1807  xInc * 4,
1808  xInc * 5,
1809  xInc * 6,
1810  xInc * 7,
1811  };
1812  const vec_u32 vshift16 = vec_splats((uint32_t) 16);
1813  const vec_u16 vshift9 = vec_splat_u16(9);
1814 
1815  for (i = 0; i < dstWidth; i += 16) {
1816  vcoord16[0] = vec_splats((uint16_t) xpos);
1817  vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));
1818 
1819  vcoord16[0] = vec_add(vcoord16[0], vadd16);
1820  vcoord16[1] = vec_add(vcoord16[1], vadd16);
1821 
1822  vcoord16[0] = vec_sr(vcoord16[0], vshift9);
1823  vcoord16[1] = vec_sr(vcoord16[1], vshift9);
1824  valpha = vec_pack(vcoord16[0], vcoord16[1]);
1825  valphaxor = vec_xor(valpha, vxor);
1826 
1827  xx = xpos >> 16;
1828 
1829  vcoord[0] = vec_splats(xpos & 0xffff);
1830  vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
1831  vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
1832  vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);
1833 
1834  vcoord[0] = vec_add(vcoord[0], vadd);
1835  vcoord[1] = vec_add(vcoord[1], vadd);
1836  vcoord[2] = vec_add(vcoord[2], vadd);
1837  vcoord[3] = vec_add(vcoord[3], vadd);
1838 
1839  vcoord[0] = vec_sr(vcoord[0], vshift16);
1840  vcoord[1] = vec_sr(vcoord[1], vshift16);
1841  vcoord[2] = vec_sr(vcoord[2], vshift16);
1842  vcoord[3] = vec_sr(vcoord[3], vshift16);
1843 
1844  vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
1845  vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
1846  vperm = vec_pack(vcoord16[0], vcoord16[1]);
1847 
1848  HCSCALE(src1, dst1);
1849  HCSCALE(src2, dst2);
1850 
1851  xpos += xInc * 16;
1852  }
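/* Same edge replication as at the end of hyscale_fast_vsx(), applied to
 * both chroma planes. */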
1853  for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
1854  dst1[i] = src1[srcW - 1] * 128;
1855  dst2[i] = src2[srcW - 1] * 128;
1856  }
1857 }
1858 
1859 #undef HCSCALE
1860 
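/*
 * vunusedtab provides vec_perm masks for the last chunk of a filter whose
 * tap count is not a multiple of 8: byte indices 0x00..0x0f select from
 * the loaded samples while 0x10 selects from the zero vector, so entry
 * [filterSize % 8] keeps the first (filterSize % 8) 16-bit taps and zeroes
 * the rest before they enter the multiply-accumulate (entry 0 is the
 * identity mask for multiples of 8).
 */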
1861 static void hScale8To19_vsx(SwsContext *c, int16_t *_dst, int dstW,
1862  const uint8_t *src, const int16_t *filter,
1863  const int32_t *filterPos, int filterSize)
1864 {
1865  int i, j;
1866  int32_t *dst = (int32_t *) _dst;
1867  vec_s16 vfilter, vin;
1868  vec_u8 vin8;
1869  vec_s32 vout;
1870  const vec_u8 vzero = vec_splat_u8(0);
1871  const vec_u8 vunusedtab[8] = {
1872  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1873  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
1874  (vec_u8) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
1875  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1876  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
1877  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1878  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
1879  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1880  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1881  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1882  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1883  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1884  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1885  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
1886  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1887  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
1888  };
1889  const vec_u8 vunused = vunusedtab[filterSize % 8];
1890 
1891  if (filterSize == 1) {
1892  for (i = 0; i < dstW; i++) {
1893  int srcPos = filterPos[i];
1894  int val = 0;
1895  for (j = 0; j < filterSize; j++) {
1896  val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
1897  }
1898  dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // the cubic filter can overflow the 19-bit range, hence the clip
1899  }
1900  } else {
1901  for (i = 0; i < dstW; i++) {
1902  const int srcPos = filterPos[i];
1903  vout = vec_splat_s32(0);
1904  for (j = 0; j < filterSize; j += 8) {
1905  vin8 = vec_vsx_ld(0, &src[srcPos + j]);
1906  vin = (vec_s16) vec_mergeh(vin8, vzero);
1907  if (j + 8 > filterSize) // Remove the unused elements on the last round
1908  vin = vec_perm(vin, (vec_s16) vzero, vunused);
1909 
1910  vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
1911  vout = vec_msums(vin, vfilter, vout);
1912  }
1913  vout = vec_sums(vout, (vec_s32) vzero);
1914  dst[i] = FFMIN(vout[3] >> 3, (1 << 19) - 1);
1915  }
1916  }
1917 }
1918 
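/*
 * 16-bit input variant: the samples are unsigned 16-bit, so they are
 * zero-extended into 32-bit lanes (vec_mergeh/vec_mergel against vzero on
 * little endian) and the 14-bit filter taps are sign-extended with
 * vec_unpackh/vec_unpackl; the products are formed with 32-bit vec_mul
 * and accumulated with saturating adds instead of the 16-bit vec_msums
 * used in the 8-bit scaler above.  vec_sums folds the four partial sums
 * and the scalar FFMIN clips the shifted result to 19 bits.
 */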
1919 static void hScale16To19_vsx(SwsContext *c, int16_t *_dst, int dstW,
1920  const uint8_t *_src, const int16_t *filter,
1921  const int32_t *filterPos, int filterSize)
1922 {
1923  const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
1924  int i, j;
1925  int32_t *dst = (int32_t *) _dst;
1926  const uint16_t *src = (const uint16_t *) _src;
1927  int bits = desc->comp[0].depth - 1;
1928  int sh = bits - 4;
1929  vec_s16 vfilter, vin;
1930  vec_s32 vout, vtmp, vtmp2, vfilter32_l, vfilter32_r;
1931  const vec_u8 vzero = vec_splat_u8(0);
1932  const vec_u8 vunusedtab[8] = {
1933  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1934  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
1935  (vec_u8) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
1936  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1937  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
1938  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1939  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
1940  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1941  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1942  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1943  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1944  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1945  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1946  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
1947  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1948  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
1949  };
1950  const vec_u8 vunused = vunusedtab[filterSize % 8];
1951 
1952  if ((isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8) && desc->comp[0].depth < 16) {
1953  sh = 9;
1954  } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like unsigned 16 bpc */
1955  sh = 16 - 1 - 4;
1956  }
1957 
1958  if (filterSize == 1) {
1959  for (i = 0; i < dstW; i++) {
1960  int srcPos = filterPos[i];
1961  int val = 0;
1962 
1963  for (j = 0; j < filterSize; j++) {
1964  val += src[srcPos + j] * filter[filterSize * i + j];
1965  }
1966  // filter = 14 bit, input = 16 bit, output = 30 bit; >> sh (11 for 16-bit input) makes 19 bit
1967  dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
1968  }
1969  } else {
1970  for (i = 0; i < dstW; i++) {
1971  const int srcPos = filterPos[i];
1972  vout = vec_splat_s32(0);
1973  for (j = 0; j < filterSize; j += 8) {
1974  vin = (vec_s16) vec_vsx_ld(0, &src[srcPos + j]);
1975  if (j + 8 > filterSize) // Remove the unused elements on the last round
1976  vin = vec_perm(vin, (vec_s16) vzero, vunused);
1977 
1978  vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
1979  vfilter32_l = vec_unpackh(vfilter);
1980  vfilter32_r = vec_unpackl(vfilter);
1981 
1982  vtmp = (vec_s32) vec_mergeh(vin, (vec_s16) vzero);
1983  vtmp2 = (vec_s32) vec_mergel(vin, (vec_s16) vzero);
1984 
1985  vtmp = vec_mul(vtmp, vfilter32_l);
1986  vtmp2 = vec_mul(vtmp2, vfilter32_r);
1987 
1988  vout = vec_adds(vout, vtmp);
1989  vout = vec_adds(vout, vtmp2);
1990  }
1991  vout = vec_sums(vout, (vec_s32) vzero);
1992  dst[i] = FFMIN(vout[3] >> sh, (1 << 19) - 1);
1993  }
1994  }
1995 }
1996 
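/*
 * Same structure as hScale16To19_vsx() above, but the destination stays
 * int16_t and the result is clipped to 15 bits.  sh defaults to
 * depth - 1; it becomes 13 for packed RGB / PAL8 sources narrower than
 * 16 bits and 15 for float input, which is handled like unsigned 16 bpc.
 */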
1997 static void hScale16To15_vsx(SwsContext *c, int16_t *dst, int dstW,
1998  const uint8_t *_src, const int16_t *filter,
1999  const int32_t *filterPos, int filterSize)
2000 {
2001  const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
2002  int i, j;
2003  const uint16_t *src = (const uint16_t *) _src;
2004  int sh = desc->comp[0].depth - 1;
2005  vec_s16 vfilter, vin;
2006  vec_s32 vout, vtmp, vtmp2, vfilter32_l, vfilter32_r;
2007  const vec_u8 vzero = vec_splat_u8(0);
2008  const vec_u8 vunusedtab[8] = {
2009  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2010  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
2011  (vec_u8) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
2012  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2013  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
2014  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2015  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
2016  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2017  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2018  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2019  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2020  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2021  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2022  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
2023  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2024  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
2025  };
2026  const vec_u8 vunused = vunusedtab[filterSize % 8];
2027 
2028  if (sh < 15) {
2029  sh = isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8 ? 13 : (desc->comp[0].depth - 1);
2030  } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like unsigned 16 bpc */
2031  sh = 16 - 1;
2032  }
2033 
2034  if (filterSize == 1) {
2035  for (i = 0; i < dstW; i++) {
2036  int srcPos = filterPos[i];
2037  int val = 0;
2038 
2039  for (j = 0; j < filterSize; j++) {
2040  val += src[srcPos + j] * filter[filterSize * i + j];
2041  }
2042  // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
2043  // filter = 14 bit, input = 16 bit, output = 30 bit; >> sh (15 for 16-bit input) makes 15 bit
2044  }
2045  } else {
2046  for (i = 0; i < dstW; i++) {
2047  const int srcPos = filterPos[i];
2048  vout = vec_splat_s32(0);
2049  for (j = 0; j < filterSize; j += 8) {
2050  vin = (vec_s16) vec_vsx_ld(0, &src[srcPos + j]);
2051  if (j + 8 > filterSize) // Remove the unused elements on the last round
2052  vin = vec_perm(vin, (vec_s16) vzero, vunused);
2053 
2054  vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
2055  vfilter32_l = vec_unpackh(vfilter);
2056  vfilter32_r = vec_unpackl(vfilter);
2057 
2058  vtmp = (vec_s32) vec_mergeh(vin, (vec_s16) vzero);
2059  vtmp2 = (vec_s32) vec_mergel(vin, (vec_s16) vzero);
2060 
2061  vtmp = vec_mul(vtmp, vfilter32_l);
2062  vtmp2 = vec_mul(vtmp2, vfilter32_r);
2063 
2064  vout = vec_adds(vout, vtmp);
2065  vout = vec_adds(vout, vtmp2);
2066  }
2067  vout = vec_sums(vout, (vec_s32) vzero);
2068  dst[i] = FFMIN(vout[3] >> sh, (1 << 15) - 1);
2069  }
2070  }
2071 }
2072 
2073 #endif /* !HAVE_BIGENDIAN */
2074 
2075 #endif /* HAVE_VSX */
2076 
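/*
 * Runtime dispatch for the VSX code paths: nothing is installed unless the
 * CPU reports AV_CPU_FLAG_VSX (ISA 2.06).  "power8" additionally requires
 * HAVE_POWER8 at build time and AV_CPU_FLAG_POWER8 (ISA 2.07) at run time;
 * it gates the 16-bit horizontal scalers and the packed RGB writers.  The
 * per-depth yuv2plane1/yuv2planeX switch is skipped for SWS_BITEXACT and
 * SWS_FULL_CHR_H_INT, and with SWS_BITEXACT the function returns before
 * any packed writer is installed.
 */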
2077 av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
2078 {
2079 #if HAVE_VSX
2080  enum AVPixelFormat dstFormat = c->dstFormat;
2081  const int cpu_flags = av_get_cpu_flags();
2082  const unsigned char power8 = HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8;
2083 
2084  if (!(cpu_flags & AV_CPU_FLAG_VSX))
2085  return;
2086 
2087 #if !HAVE_BIGENDIAN
2088  if (c->srcBpc == 8) {
2089  if (c->dstBpc <= 14) {
2090  c->hyScale = c->hcScale = hScale_real_vsx;
2091  if (c->flags & SWS_FAST_BILINEAR && c->dstW >= c->srcW && c->chrDstW >= c->chrSrcW) {
2092  c->hyscale_fast = hyscale_fast_vsx;
2093  c->hcscale_fast = hcscale_fast_vsx;
2094  }
2095  } else {
2096  c->hyScale = c->hcScale = hScale8To19_vsx;
2097  }
2098  } else {
2099  if (power8) {
2100  c->hyScale = c->hcScale = c->dstBpc > 14 ? hScale16To19_vsx
2101  : hScale16To15_vsx;
2102  }
2103  }
2104  if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) &&
2105  dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE &&
2106  !c->needAlpha) {
2107  c->yuv2planeX = yuv2planeX_vsx;
2108  }
2109 #endif
2110 
2111  if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) {
2112  switch (c->dstBpc) {
2113  case 8:
2114  c->yuv2plane1 = yuv2plane1_8_vsx;
2115  break;
2116 #if !HAVE_BIGENDIAN
2117  case 9:
2118  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_vsx : yuv2plane1_9LE_vsx;
2119  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_vsx : yuv2planeX_9LE_vsx;
2120  break;
2121  case 10:
2122  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_vsx : yuv2plane1_10LE_vsx;
2123  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_vsx : yuv2planeX_10LE_vsx;
2124  break;
2125  case 12:
2126  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_12BE_vsx : yuv2plane1_12LE_vsx;
2127  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_vsx : yuv2planeX_12LE_vsx;
2128  break;
2129  case 14:
2130  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_vsx : yuv2plane1_14LE_vsx;
2131  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_vsx : yuv2planeX_14LE_vsx;
2132  break;
2133  case 16:
2134  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_vsx : yuv2plane1_16LE_vsx;
2135 #if HAVE_POWER8
2136  if (cpu_flags & AV_CPU_FLAG_POWER8) {
2137  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_vsx : yuv2planeX_16LE_vsx;
2138  }
2139 #endif /* HAVE_POWER8 */
2140  break;
2141 #endif /* !HAVE_BIGENDIAN */
2142  }
2143  }
2144 
2145  if (c->flags & SWS_BITEXACT)
2146  return;
2147 
2148 #if !HAVE_BIGENDIAN
2149  if (c->flags & SWS_FULL_CHR_H_INT) {
2150  switch (dstFormat) {
2151  case AV_PIX_FMT_RGB24:
2152  if (power8) {
2153  c->yuv2packed1 = yuv2rgb24_full_1_vsx;
2154  c->yuv2packed2 = yuv2rgb24_full_2_vsx;
2155  c->yuv2packedX = yuv2rgb24_full_X_vsx;
2156  }
2157  break;
2158  case AV_PIX_FMT_BGR24:
2159  if (power8) {
2160  c->yuv2packed1 = yuv2bgr24_full_1_vsx;
2161  c->yuv2packed2 = yuv2bgr24_full_2_vsx;
2162  c->yuv2packedX = yuv2bgr24_full_X_vsx;
2163  }
2164  break;
2165  case AV_PIX_FMT_BGRA:
2166  if (power8) {
2167  if (!c->needAlpha) {
2168  c->yuv2packed1 = yuv2bgrx32_full_1_vsx;
2169  c->yuv2packed2 = yuv2bgrx32_full_2_vsx;
2170  c->yuv2packedX = yuv2bgrx32_full_X_vsx;
2171  }
2172  }
2173  break;
2174  case AV_PIX_FMT_RGBA:
2175  if (power8) {
2176  if (!c->needAlpha) {
2177  c->yuv2packed1 = yuv2rgbx32_full_1_vsx;
2178  c->yuv2packed2 = yuv2rgbx32_full_2_vsx;
2179  c->yuv2packedX = yuv2rgbx32_full_X_vsx;
2180  }
2181  }
2182  break;
2183  case AV_PIX_FMT_ARGB:
2184  if (power8) {
2185  if (!c->needAlpha) {
2186  c->yuv2packed1 = yuv2xrgb32_full_1_vsx;
2187  c->yuv2packed2 = yuv2xrgb32_full_2_vsx;
2188  c->yuv2packedX = yuv2xrgb32_full_X_vsx;
2189  }
2190  }
2191  break;
2192  case AV_PIX_FMT_ABGR:
2193  if (power8) {
2194  if (!c->needAlpha) {
2195  c->yuv2packed1 = yuv2xbgr32_full_1_vsx;
2196  c->yuv2packed2 = yuv2xbgr32_full_2_vsx;
2197  c->yuv2packedX = yuv2xbgr32_full_X_vsx;
2198  }
2199  }
2200  break;
2201  }
2202  } else { /* !SWS_FULL_CHR_H_INT */
2203  switch (dstFormat) {
2204  case AV_PIX_FMT_YUYV422:
2205  c->yuv2packed1 = yuv2yuyv422_1_vsx;
2206  c->yuv2packed2 = yuv2yuyv422_2_vsx;
2207  c->yuv2packedX = yuv2yuyv422_X_vsx;
2208  break;
2209  case AV_PIX_FMT_YVYU422:
2210  c->yuv2packed1 = yuv2yvyu422_1_vsx;
2211  c->yuv2packed2 = yuv2yvyu422_2_vsx;
2212  c->yuv2packedX = yuv2yvyu422_X_vsx;
2213  break;
2214  case AV_PIX_FMT_UYVY422:
2215  c->yuv2packed1 = yuv2uyvy422_1_vsx;
2216  c->yuv2packed2 = yuv2uyvy422_2_vsx;
2217  c->yuv2packedX = yuv2uyvy422_X_vsx;
2218  break;
2219  case AV_PIX_FMT_BGRA:
2220  if (power8) {
2221  if (!c->needAlpha) {
2222  c->yuv2packed1 = yuv2bgrx32_1_vsx;
2223  c->yuv2packed2 = yuv2bgrx32_2_vsx;
2224  }
2225  }
2226  break;
2227  case AV_PIX_FMT_RGBA:
2228  if (power8) {
2229  if (!c->needAlpha) {
2230  c->yuv2packed1 = yuv2rgbx32_1_vsx;
2231  c->yuv2packed2 = yuv2rgbx32_2_vsx;
2232  }
2233  }
2234  break;
2235  case AV_PIX_FMT_ARGB:
2236  if (power8) {
2237  if (!c->needAlpha) {
2238  c->yuv2packed1 = yuv2xrgb32_1_vsx;
2239  c->yuv2packed2 = yuv2xrgb32_2_vsx;
2240  }
2241  }
2242  break;
2243  case AV_PIX_FMT_ABGR:
2244  if (power8) {
2245  if (!c->needAlpha) {
2246  c->yuv2packed1 = yuv2xbgr32_1_vsx;
2247  c->yuv2packed2 = yuv2xbgr32_2_vsx;
2248  }
2249  }
2250  break;
2251  case AV_PIX_FMT_RGB24:
2252  if (power8) {
2253  c->yuv2packed1 = yuv2rgb24_1_vsx;
2254  c->yuv2packed2 = yuv2rgb24_2_vsx;
2255  }
2256  break;
2257  case AV_PIX_FMT_BGR24:
2258  if (power8) {
2259  c->yuv2packed1 = yuv2bgr24_1_vsx;
2260  c->yuv2packed2 = yuv2bgr24_2_vsx;
2261  }
2262  break;
2263  }
2264  }
2265 #endif /* !HAVE_BIGENDIAN */
2266 
2267 #endif /* HAVE_VSX */
2268 }