FFmpeg  4.4.5
swscale_ppc_template.c
/*
 * AltiVec-enhanced yuv2yuvX
 *
 * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
 * based on the equivalent C code in swscale.c
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem_internal.h"

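/*
 * Vertical scaler kernel: computes 16 consecutive output bytes of one plane.
 * LOAD_FILTER, LOAD_L1, yuv2planeX_8 and VEC_ST are expected to be provided
 * as macros by the file that includes this template.  Per output pixel the
 * vector code below is roughly equivalent to the scalar loop in yuv2planeX_u
 * further down:
 *
 *     int t = dither[(x + i + offset) & 7] << 12;
 *     for (j = 0; j < filterSize; j++)
 *         t += src[j][x + i] * filter[j];
 *     dest[i] = av_clip_uint8(t >> 19);   // the splatted shift is 10 + 9 = 19
 */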
static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
                                  const int16_t **src, uint8_t *dest,
                                  const uint8_t *dither, int offset, int x)
{
    register int i, j;
    LOCAL_ALIGNED(16, int, val, [16]);
    vector signed int vo1, vo2, vo3, vo4;
    vector unsigned short vs1, vs2;
    vector unsigned char vf;
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));

    for (i = 0; i < 16; i++)
        val[i] = dither[(x + i + offset) & 7] << 12;

    vo1 = vec_ld(0,  val);
    vo2 = vec_ld(16, val);
    vo3 = vec_ld(32, val);
    vo4 = vec_ld(48, val);

    for (j = 0; j < filterSize; j++) {
        unsigned int joffset = j << 1;
        unsigned int xoffset = x << 1;
        vector unsigned char av_unused perm;
        vector signed short l1, vLumFilter;
        LOAD_FILTER(vLumFilter, filter);
        vLumFilter = vec_splat(vLumFilter, 0);
        LOAD_L1(l1, src[j], perm);
        yuv2planeX_8(vo1, vo2, l1, src[j], x,     perm, vLumFilter);
        yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
    }

    vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
    vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
    vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
    vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
    vs1 = vec_packsu(vo1, vo2);
    vs2 = vec_packsu(vo3, vo4);
    vf  = vec_packsu(vs1, vs2);
    VEC_ST(vf, 0, dest);
}

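/*
 * Plain C fallback, used by FUNC(yuv2planeX) below for the pixels before
 * the first 16-byte-aligned destination address and for the tail that does
 * not fill a whole 16-pixel vector block.
 */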
static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
                                const int16_t **src, uint8_t *dest, int dstW,
                                const uint8_t *dither, int offset, int x)
{
    int i, j;

    for (i = x; i < dstW; i++) {
        int t = dither[(i + offset) & 7] << 12;
        for (j = 0; j < filterSize; j++)
            t += src[j][i] * filter[j];
        dest[i] = av_clip_uint8(t >> 19);
    }
}

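/*
 * Head/body/tail split: scalar code up to the first 16-byte-aligned
 * destination address, then 16 pixels at a time through the AltiVec kernel,
 * then scalar code for the remainder.  For example, if dest % 16 == 9, then
 * dst_u = (-9) & 15 = 7, so 7 pixels are handled in C before the vector
 * loop starts on an aligned address.
 */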
static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    int dst_u = -(uintptr_t)dest & 15;
    int i;

    yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);

    for (i = dst_u; i < dstW - 15; i += 16)
        FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
                              offset, i);

    yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
}

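/*
 * Horizontal scaler: for every output pixel i it multiplies filterSize
 * source bytes starting at filterPos[i] by the corresponding 16-bit filter
 * coefficients, sums the products, shifts right by 7 and clamps to the
 * 15-bit maximum.  Filter sizes that are not a multiple of 4 fall back to
 * scalar code; sizes 4, 8 and 16 get dedicated vector paths, and the
 * default branch processes longer filters 16 taps at a time with an
 * 8-tap tail step.
 */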
static void FUNC(hScale_real)(SwsContext *c, int16_t *dst, int dstW,
                              const uint8_t *src, const int16_t *filter,
                              const int32_t *filterPos, int filterSize)
{
    register int i;
    LOCAL_ALIGNED(16, int, tempo, [4]);

    if (filterSize % 4) {
        for (i = 0; i < dstW; i++) {
            register int j;
            register int srcPos = filterPos[i];
            register int val    = 0;
            for (j = 0; j < filterSize; j++)
                val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
            dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
        }
    } else
        switch (filterSize) {
        case 4:
            for (i = 0; i < dstW; i++) {
                register int srcPos = filterPos[i];

                vector unsigned char src_vF = unaligned_load(srcPos, src);
                vector signed short src_v, filter_v;
                vector signed int val_vEven, val_s;
                src_v = // vec_unpackh sign-extends...
                    (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
                // now put our elements in the even slots
                src_v = vec_mergeh(src_v, (vector signed short)vzero);
                GET_VF4(i, filter_v, filter);
                val_vEven = vec_mule(src_v, filter_v);
                val_s     = vec_sums(val_vEven, vzero);
                vec_st(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
            }
            break;
        case 8:
            for (i = 0; i < dstW; i++) {
                register int srcPos = filterPos[i];
                vector unsigned char src_vF, av_unused src_v0, av_unused src_v1;
                vector unsigned char av_unused permS;
                vector signed short src_v, filter_v;
                vector signed int val_v, val_s;
                FIRST_LOAD(src_v0, srcPos, src, permS);
                LOAD_SRCV8(srcPos, 0, src, permS, src_v0, src_v1, src_vF);
                src_v = // vec_unpackh sign-extends...
                    (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
                filter_v = vec_ld(i << 4, filter);
                val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
                val_s = vec_sums(val_v, vzero);
                vec_st(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
            }
            break;

        case 16:
            for (i = 0; i < dstW; i++) {
                register int srcPos = filterPos[i];

                vector unsigned char src_vF = unaligned_load(srcPos, src);
                vector signed short src_vA = // vec_unpackh sign-extends...
                    (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
                vector signed short src_vB = // vec_unpackh sign-extends...
                    (vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
                vector signed short filter_v0 = vec_ld(i << 5, filter);
                vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);

                vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
                vector signed int val_v   = vec_msums(src_vB, filter_v1, val_acc);

                vector signed int val_s = vec_sums(val_v, vzero);

                VEC_ST(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
            }
            break;

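        /*
         * Generic path: consume the filter 16 taps at a time, accumulating
         * with vec_msums, then handle a possible 8-tap remainder before the
         * final horizontal sum.  GET_VFD, LOAD_SRCV, LOAD_SRCV8, FIRST_LOAD
         * and UPDATE_PTR are supplied by the including file.
         */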
        default:
            for (i = 0; i < dstW; i++) {
                register int j, av_unused offset = i * 2 * filterSize;
                register int srcPos = filterPos[i];

                vector signed int val_s, val_v = (vector signed int)vzero;
                vector signed short av_unused filter_v0R;
                vector unsigned char av_unused permF, av_unused src_v0, av_unused permS;
                FIRST_LOAD(filter_v0R, offset, filter, permF);
                FIRST_LOAD(src_v0, srcPos, src, permS);

                for (j = 0; j < filterSize - 15; j += 16) {
                    vector unsigned char av_unused src_v1, src_vF;
                    vector signed short av_unused filter_v1R, av_unused filter_v2R,
                                        filter_v0, filter_v1, src_vA, src_vB;
                    vector signed int val_acc;
                    LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF);
                    src_vA = // vec_unpackh sign-extends...
                        (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
                    src_vB = // vec_unpackh sign-extends...
                        (vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
                    GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v0, 0);
                    GET_VFD(i, j, filter, filter_v1R, filter_v2R, permF, filter_v1, 16);

                    val_acc = vec_msums(src_vA, filter_v0, val_v);
                    val_v   = vec_msums(src_vB, filter_v1, val_acc);
                    UPDATE_PTR(filter_v2R, filter_v0R, src_v1, src_v0);
                }

                if (j < filterSize - 7) {
                    // loading src_v0 is useless, it's already done above
                    vector unsigned char av_unused src_v1, src_vF;
                    vector signed short src_v, av_unused filter_v1R, filter_v;
                    LOAD_SRCV8(srcPos, j, src, permS, src_v0, src_v1, src_vF);
                    src_v = // vec_unpackh sign-extends...
                        (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
                    GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v, 0);
                    val_v = vec_msums(src_v, filter_v, val_v);
                }
                val_s = vec_sums(val_v, vzero);

                VEC_ST(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
            }
        }
}
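/*
 * Instantiation sketch (an assumption, not part of this template): the
 * including file defines FUNC() and the vector helper macros before
 * including this file, roughly along the lines of
 *
 *     #define FUNC(name) name ## _altivec
 *     // ... LOAD_FILTER, LOAD_L1, VEC_ST, unaligned_load, GET_VFD, ...
 *     #include "swscale_ppc_template.c"
 *
 * producing yuv2planeX_altivec() and hScale_real_altivec(), which the PPC
 * init code can then install into the SwsContext function pointers.
 */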