| File: | var/lib/jenkins/workspace/firefox-scan-build/gfx/2d/ssse3-scaler.c |
| Warning: | line 293, column 34 Dereference of null pointer |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
| 1 | /* | |||
| 2 | * Copyright © 2013 Soren Sandmann Pedersen | |||
| 3 | * Copyright © 2013 Red Hat, Inc. | |||
| 4 | * Copyright © 2016 Mozilla Foundation | |||
| 5 | * | |||
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a | |||
| 7 | * copy of this software and associated documentation files (the "Software"), | |||
| 8 | * to deal in the Software without restriction, including without limitation | |||
| 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |||
| 10 | * and/or sell copies of the Software, and to permit persons to whom the | |||
| 11 | * Software is furnished to do so, subject to the following conditions: | |||
| 12 | * | |||
| 13 | * The above copyright notice and this permission notice (including the next | |||
| 14 | * paragraph) shall be included in all copies or substantial portions of the | |||
| 15 | * Software. | |||
| 16 | * | |||
| 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
| 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
| 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |||
| 20 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
| 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |||
| 22 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |||
| 23 | * DEALINGS IN THE SOFTWARE. | |||
| 24 | * | |||
| 25 | * Author: Soren Sandmann (soren.sandmann@gmail.com) | |||
| 26 | * Jeff Muizelaar (jmuizelaar@mozilla.com) | |||
| 27 | */ | |||
| 28 | ||||
| 29 | /* This has been adapted from the ssse3 code from pixman. It's currently | |||
| 30 | * a mess as I want to try it out in practice before finalizing the details. | |||
| 31 | */ | |||
| 32 | ||||
| 33 | #include <stdlib.h> | |||
| 34 | #include <xmmintrin.h> | |||
| 35 | #include <emmintrin.h> | |||
| 36 | #include <tmmintrin.h> | |||
| 37 | #include <stdint.h> | |||
| 38 | #include <assert.h> | |||
| 39 | #include "ssse3-scaler.h" | |||
| 40 | ||||
| 41 | typedef int32_t pixman_fixed_16_16_t; | |||
| 42 | typedef pixman_fixed_16_16_t pixman_fixed_t; | |||
| 43 | #define pixman_fixed_1 (pixman_int_to_fixed(1)) | |||
| 44 | #define pixman_fixed_to_int(f) ((int)((f) >> 16)) | |||
| 45 | #define pixman_int_to_fixed(i) ((pixman_fixed_t)((i) << 16)) | |||
| 46 | #define pixman_double_to_fixed(d) ((pixman_fixed_t)((d) * 65536.0)) | |||
| 47 | #define PIXMAN_FIXED_INT_MAX 32767 | |||
| 48 | #define PIXMAN_FIXED_INT_MIN -32768 | |||
| 49 | typedef struct pixman_vector pixman_vector_t; | |||
| 50 | ||||
| 51 | typedef int pixman_bool_t; | |||
| 52 | typedef int64_t pixman_fixed_32_32_t; | |||
| 53 | typedef pixman_fixed_32_32_t pixman_fixed_48_16_t; | |||
| 54 | typedef struct { | |||
| 55 | pixman_fixed_48_16_t v[3]; | |||
| 56 | } pixman_vector_48_16_t; | |||
| 57 | ||||
| 58 | struct pixman_vector { | |||
| 59 | pixman_fixed_t vector[3]; | |||
| 60 | }; | |||
| 61 | typedef struct pixman_transform pixman_transform_t; | |||
| 62 | ||||
| 63 | struct pixman_transform { | |||
| 64 | pixman_fixed_t matrix[3][3]; | |||
| 65 | }; | |||
| 66 | ||||
| 67 | #ifdef _MSC_VER | |||
| 68 | # define force_inline __forceinline | |||
| 69 | #else | |||
| 70 | # define force_inline __inline__ __attribute__((always_inline)) | |||
| 71 | #endif | |||
| 72 | ||||
| 73 | #define BILINEAR_INTERPOLATION_BITS 6 | |||
| 74 | ||||
| 75 | static force_inline int pixman_fixed_to_bilinear_weight(pixman_fixed_t x) { | |||
| 76 | return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) & | |||
| 77 | ((1 << BILINEAR_INTERPOLATION_BITS) - 1); | |||
| 78 | } | |||
| 79 | ||||
| 80 | static void pixman_transform_point_31_16_3d(const pixman_transform_t* t, | |||
| 81 | const pixman_vector_48_16_t* v, | |||
| 82 | pixman_vector_48_16_t* result) { | |||
| 83 | int i; | |||
| 84 | int64_t tmp[3][2]; | |||
| 85 | ||||
| 86 | /* input vector values must have no more than 31 bits (including sign) | |||
| 87 | * in the integer part */ | |||
| 88 | assert(v->v[0] < ((pixman_fixed_48_16_t)1 << (30 + 16))); | |||
| 89 | assert(v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); | |||
| 90 | assert(v->v[1] < ((pixman_fixed_48_16_t)1 << (30 + 16))); | |||
| 91 | assert(v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); | |||
| 92 | assert(v->v[2] < ((pixman_fixed_48_16_t)1 << (30 + 16))); | |||
| 93 | assert(v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); | |||
| 94 | ||||
| 95 | for (i = 0; i < 3; i++) { | |||
| 96 | tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16); | |||
| 97 | tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF); | |||
| 98 | tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16); | |||
| 99 | tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF); | |||
| 100 | tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16); | |||
| 101 | tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF); | |||
| 102 | } | |||
| 103 | ||||
| 104 | result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16); | |||
| 105 | result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16); | |||
| 106 | result->v[2] = tmp[2][0] + ((tmp[2][1] + 0x8000) >> 16); | |||
| 107 | } | |||
| 108 | ||||
| 109 | static pixman_bool_t pixman_transform_point_3d( | |||
| 110 | const struct pixman_transform* transform, struct pixman_vector* vector) { | |||
| 111 | pixman_vector_48_16_t tmp; | |||
| 112 | tmp.v[0] = vector->vector[0]; | |||
| 113 | tmp.v[1] = vector->vector[1]; | |||
| 114 | tmp.v[2] = vector->vector[2]; | |||
| 115 | ||||
| 116 | pixman_transform_point_31_16_3d(transform, &tmp, &tmp); | |||
| 117 | ||||
| 118 | vector->vector[0] = tmp.v[0]; | |||
| 119 | vector->vector[1] = tmp.v[1]; | |||
| 120 | vector->vector[2] = tmp.v[2]; | |||
| 121 | ||||
| 122 | return vector->vector[0] == tmp.v[0] && vector->vector[1] == tmp.v[1] && | |||
| 123 | vector->vector[2] == tmp.v[2]; | |||
| 124 | } | |||
| 125 | ||||
| 126 | struct bits_image_t { | |||
| 127 | uint32_t* bits; | |||
| 128 | int rowstride; | |||
| 129 | pixman_transform_t* transform; | |||
| 130 | }; | |||
| 131 | ||||
| 132 | typedef struct bits_image_t bits_image_t; | |||
| 133 | typedef struct { | |||
| 134 | int unused; | |||
| 135 | } pixman_iter_info_t; | |||
| 136 | ||||
| 137 | typedef struct pixman_iter_t pixman_iter_t; | |||
| 138 | typedef void (*pixman_iter_fini_t)(pixman_iter_t* iter); | |||
| 139 | ||||
| 140 | struct pixman_iter_t { | |||
| 141 | int x, y; | |||
| 142 | pixman_iter_fini_t fini; | |||
| 143 | bits_image_t* image; | |||
| 144 | uint32_t* buffer; | |||
| 145 | int width; | |||
| 146 | int height; | |||
| 147 | void* data; | |||
| 148 | }; | |||
| 149 | ||||
| 150 | typedef struct { | |||
| 151 | int y; | |||
| 152 | uint64_t* buffer; | |||
| 153 | } line_t; | |||
| 154 | ||||
| 155 | typedef struct { | |||
| 156 | line_t lines[2]; | |||
| 157 | pixman_fixed_t y; | |||
| 158 | pixman_fixed_t x; | |||
| 159 | uint64_t data[1]; | |||
| 160 | } bilinear_info_t; | |||
| 161 | ||||
| 162 | static void ssse3_fetch_horizontal(bits_image_t* image, line_t* line, int y, | |||
| 163 | pixman_fixed_t x, pixman_fixed_t ux, int n) { | |||
| 164 | uint32_t* bits = image->bits + y * image->rowstride; | |||
| 165 | __m128i vx = _mm_set_epi16(-(x + 1), x, -(x + 1), x, -(x + ux + 1), x + ux, | |||
| 166 | -(x + ux + 1), x + ux); | |||
| 167 | __m128i vux = _mm_set_epi16(-2 * ux, 2 * ux, -2 * ux, 2 * ux, -2 * ux, 2 * ux, | |||
| 168 | -2 * ux, 2 * ux); | |||
| 169 | __m128i vaddc = _mm_set_epi16(1, 0, 1, 0, 1, 0, 1, 0); | |||
| 170 | __m128i* b = (__m128i*)line->buffer; | |||
| 171 | __m128i vrl0, vrl1; | |||
| 172 | ||||
| 173 | while ((n -= 2) >= 0) { | |||
| 174 | __m128i vw, vr, s; | |||
| 175 | #ifdef HACKY_PADDING | |||
| 176 | if (pixman_fixed_to_int(x + ux) >= image->rowstride) { | |||
| 177 | vrl1 = _mm_setzero_si128(); | |||
| 178 | printf("overread 2loop\n"); | |||
| 179 | } else { | |||
| 180 | if (pixman_fixed_to_int(x + ux) < 0) printf("underflow\n"); | |||
| 181 | vrl1 = _mm_loadl_epi64( | |||
| 182 | (__m128i*)(bits + (pixman_fixed_to_int(x + ux) < 0 | |||
| 183 | ? 0 | |||
| 184 | : pixman_fixed_to_int(x + ux)))); | |||
| 185 | } | |||
| 186 | #else | |||
| 187 | vrl1 = _mm_loadl_epi64((__m128i*)(bits + pixman_fixed_to_int(x + ux))); | |||
| 188 | #endif | |||
| 189 | /* vrl1: R1, L1 */ | |||
| 190 | ||||
| 191 | final_pixel: | |||
| 192 | #ifdef HACKY_PADDING | |||
| 193 | vrl0 = _mm_loadl_epi64( | |||
| 194 | (__m128i*)(bits + | |||
| 195 | (pixman_fixed_to_int(x) < 0 ? 0 : pixman_fixed_to_int(x)))); | |||
| 196 | #else | |||
| 197 | vrl0 = _mm_loadl_epi64((__m128i*)(bits + pixman_fixed_to_int(x))); | |||
| 198 | #endif | |||
| 199 | /* vrl0: R0, L0 */ | |||
| 200 | ||||
| 201 | /* The weights are based on vx which is a vector of | |||
| 202 | * | |||
| 203 | * - (x + 1), x, - (x + 1), x, | |||
| 204 | * - (x + ux + 1), x + ux, - (x + ux + 1), x + ux | |||
| 205 | * | |||
| 206 | * so the 16 bit weights end up like this: | |||
| 207 | * | |||
| 208 | * iw0, w0, iw0, w0, iw1, w1, iw1, w1 | |||
| 209 | * | |||
| 210 | * and after shifting and packing, we get these bytes: | |||
| 211 | * | |||
| 212 | * iw0, w0, iw0, w0, iw1, w1, iw1, w1, | |||
| 213 | * iw0, w0, iw0, w0, iw1, w1, iw1, w1, | |||
| 214 | * | |||
| 215 | * which means the first and the second input pixel | |||
| 216 | * have to be interleaved like this: | |||
| 217 | * | |||
| 218 | * la0, ra0, lr0, rr0, la1, ra1, lr1, rr1, | |||
| 219 | * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1 | |||
| 220 | * | |||
| 221 | * before maddubsw can be used. | |||
| 222 | */ | |||
| 223 | ||||
| 224 | vw = _mm_add_epi16(vaddc, | |||
| 225 | _mm_srli_epi16(vx, 16 - BILINEAR_INTERPOLATION_BITS)); | |||
| 226 | /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1 | |||
| 227 | */ | |||
| 228 | ||||
| 229 | vw = _mm_packus_epi16(vw, vw); | |||
| 230 | /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1, | |||
| 231 | * iw0, w0, iw0, w0, iw1, w1, iw1, w1 | |||
| 232 | */ | |||
| 233 | vx = _mm_add_epi16(vx, vux); | |||
| 234 | ||||
| 235 | x += 2 * ux; | |||
| 236 | ||||
| 237 | vr = _mm_unpacklo_epi16(vrl1, vrl0); | |||
| 238 | /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */ | |||
| 239 | ||||
| 240 | s = _mm_shuffle_epi32(vr, _MM_SHUFFLE(1, 0, 3, 2)); | |||
| 241 | /* s: lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */ | |||
| 242 | ||||
| 243 | vr = _mm_unpackhi_epi8(vr, s); | |||
| 244 | /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1, | |||
| 245 | * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1 | |||
| 246 | */ | |||
| 247 | ||||
| 248 | vr = _mm_maddubs_epi16(vr, vw); | |||
| 249 | ||||
| 250 | /* When the weight is 0, the inverse weight is | |||
| 251 | * 128 which can't be represented in a signed byte. | |||
| 252 | * As a result maddubsw computes the following: | |||
| 253 | * | |||
| 254 | * r = l * -128 + r * 0 | |||
| 255 | * | |||
| 256 | * rather than the desired | |||
| 257 | * | |||
| 258 | * r = l * 128 + r * 0 | |||
| 259 | * | |||
| 260 | * We fix this by taking the absolute value of the | |||
| 261 | * result. | |||
| 262 | */ | |||
| 263 | // we can drop this if we use lower precision | |||
| 264 | ||||
| 265 | vr = _mm_shuffle_epi32(vr, _MM_SHUFFLE(2, 0, 3, 1)); | |||
| 266 | /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */ | |||
| 267 | _mm_store_si128(b++, vr); | |||
| 268 | } | |||
| 269 | ||||
| 270 | if (n == -1) { | |||
| 271 | vrl1 = _mm_setzero_si128(); | |||
| 272 | goto final_pixel; | |||
| 273 | } | |||
| 274 | ||||
| 275 | line->y = y; | |||
| 276 | } | |||
| 277 | ||||
| 278 | // scale a line of destination pixels | |||
| 279 | static uint32_t* ssse3_fetch_bilinear_cover(pixman_iter_t* iter, | |||
| 280 | const uint32_t* mask) { | |||
| 281 | pixman_fixed_t fx, ux; | |||
| 282 | bilinear_info_t* info = iter->data; | |||
| 283 | line_t *line0, *line1; | |||
| 284 | int y0, y1; | |||
| 285 | int32_t dist_y; | |||
| 286 | __m128i vw, uvw; | |||
| 287 | int i; | |||
| 288 | ||||
| 289 | fx = info->x; | |||
| 290 | ux = iter->image->transform->matrix[0][0]; | |||
| 291 | ||||
| 292 | y0 = pixman_fixed_to_int(info->y); | |||
| 293 | if (y0 < 0) *(volatile char*)0 = 9; | |||
| ||||
| 294 | y1 = y0 + 1; | |||
| 295 | ||||
| 296 | // clamping in y direction | |||
| 297 | if (y1 >= iter->height) { | |||
| 298 | y1 = iter->height - 1; | |||
| 299 | } | |||
| 300 | ||||
| 301 | line0 = &info->lines[y0 & 0x01]; | |||
| 302 | line1 = &info->lines[y1 & 0x01]; | |||
| 303 | ||||
| 304 | if (line0->y != y0) { | |||
| 305 | ssse3_fetch_horizontal(iter->image, line0, y0, fx, ux, iter->width); | |||
| 306 | } | |||
| 307 | ||||
| 308 | if (line1->y != y1) { | |||
| 309 | ssse3_fetch_horizontal(iter->image, line1, y1, fx, ux, iter->width); | |||
| 310 | } | |||
| 311 | ||||
| 312 | #ifdef PIXMAN_STYLE_INTERPOLATION | |||
| 313 | dist_y = pixman_fixed_to_bilinear_weight(info->y); | |||
| 314 | dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS); | |||
| 315 | ||||
| 316 | vw = _mm_set_epi16(dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, | |||
| 317 | dist_y); | |||
| 318 | ||||
| 319 | #else | |||
| 320 | // setup the weights for the top (vw) and bottom (uvw) lines | |||
| 321 | dist_y = pixman_fixed_to_bilinear_weight(info->y); | |||
| 322 | // we use 15 instead of 16 because we need an extra bit to handle when the | |||
| 323 | // weights are 0 and 1 | |||
| 324 | dist_y <<= (15 - BILINEAR_INTERPOLATION_BITS); | |||
| 325 | ||||
| 326 | vw = _mm_set_epi16(dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, | |||
| 327 | dist_y); | |||
| 328 | ||||
| 329 | dist_y = (1 << BILINEAR_INTERPOLATION_BITS) - | |||
| 330 | pixman_fixed_to_bilinear_weight(info->y); | |||
| 331 | dist_y <<= (15 - BILINEAR_INTERPOLATION_BITS); | |||
| 332 | uvw = _mm_set_epi16(dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, | |||
| 333 | dist_y); | |||
| 334 | #endif | |||
| 335 | ||||
| 336 | for (i = 0; i + 3 < iter->width; i += 4) { | |||
| 337 | __m128i top0 = _mm_load_si128((__m128i*)(line0->buffer + i)); | |||
| 338 | __m128i bot0 = _mm_load_si128((__m128i*)(line1->buffer + i)); | |||
| 339 | __m128i top1 = _mm_load_si128((__m128i*)(line0->buffer + i + 2)); | |||
| 340 | __m128i bot1 = _mm_load_si128((__m128i*)(line1->buffer + i + 2)); | |||
| 341 | #ifdef PIXMAN_STYLE_INTERPOLATION | |||
| 342 | __m128i r0, r1, tmp, p; | |||
| 343 | ||||
| 344 | r0 = _mm_mulhi_epu16(_mm_sub_epi16(bot0, top0), vw); | |||
| 345 | tmp = _mm_cmplt_epi16(bot0, top0); | |||
| 346 | tmp = _mm_and_si128(tmp, vw); | |||
| 347 | r0 = _mm_sub_epi16(r0, tmp); | |||
| 348 | r0 = _mm_add_epi16(r0, top0); | |||
| 349 | r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS); | |||
| 350 | /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */ | |||
| 351 | // r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1)); | |||
| 352 | /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */ | |||
| 353 | ||||
| 354 | // tmp = bot1 < top1 ? vw : 0; | |||
| 355 | // r1 = (bot1 - top1)*vw + top1 - tmp | |||
| 356 | // r1 = bot1*vw - vw*top1 + top1 - tmp | |||
| 357 | // r1 = bot1*vw + top1 - vw*top1 - tmp | |||
| 358 | // r1 = bot1*vw + top1*(1 - vw) - tmp | |||
| 359 | r1 = _mm_mulhi_epu16(_mm_sub_epi16(bot1, top1), vw); | |||
| 360 | tmp = _mm_cmplt_epi16(bot1, top1); | |||
| 361 | tmp = _mm_and_si128(tmp, vw); | |||
| 362 | r1 = _mm_sub_epi16(r1, tmp); | |||
| 363 | r1 = _mm_add_epi16(r1, top1); | |||
| 364 | r1 = _mm_srli_epi16(r1, BILINEAR_INTERPOLATION_BITS); | |||
| 365 | // r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1)); | |||
| 366 | /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */ | |||
| 367 | #else | |||
| 368 | __m128i r0, r1, p; | |||
| 369 | top0 = _mm_mulhi_epu16(top0, uvw); | |||
| 370 | bot0 = _mm_mulhi_epu16(bot0, vw); | |||
| 371 | r0 = _mm_add_epi16(top0, bot0); | |||
| 372 | r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS - 1); | |||
| 373 | ||||
| 374 | top1 = _mm_mulhi_epu16(top1, uvw); | |||
| 375 | bot1 = _mm_mulhi_epu16(bot1, vw); | |||
| 376 | r1 = _mm_add_epi16(top1, bot1); | |||
| 377 | r1 = _mm_srli_epi16(r1, BILINEAR_INTERPOLATION_BITS - 1); | |||
| 378 | #endif | |||
| 379 | ||||
| 380 | p = _mm_packus_epi16(r0, r1); | |||
| 381 | _mm_storeu_si128((__m128i*)(iter->buffer + i), p); | |||
| 382 | } | |||
| 383 | ||||
| 384 | while (i < iter->width) { | |||
| 385 | __m128i top0 = _mm_load_si128((__m128i*)(line0->buffer + i)); | |||
| 386 | __m128i bot0 = _mm_load_si128((__m128i*)(line1->buffer + i)); | |||
| 387 | ||||
| 388 | #ifdef PIXMAN_STYLE_INTERPOLATION | |||
| 389 | __m128i r0, tmp, p; | |||
| 390 | r0 = _mm_mulhi_epu16(_mm_sub_epi16(bot0, top0), vw); | |||
| 391 | tmp = _mm_cmplt_epi16(bot0, top0); | |||
| 392 | tmp = _mm_and_si128(tmp, vw); | |||
| 393 | r0 = _mm_sub_epi16(r0, tmp); | |||
| 394 | r0 = _mm_add_epi16(r0, top0); | |||
| 395 | r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS); | |||
| 396 | /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */ | |||
| 397 | r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2, 0, 3, 1)); | |||
| 398 | /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */ | |||
| 399 | #else | |||
| 400 | __m128i r0, p; | |||
| 401 | top0 = _mm_mulhi_epu16(top0, uvw); | |||
| 402 | bot0 = _mm_mulhi_epu16(bot0, vw); | |||
| 403 | r0 = _mm_add_epi16(top0, bot0); | |||
| 404 | r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS - 1); | |||
| 405 | #endif | |||
| 406 | ||||
| 407 | p = _mm_packus_epi16(r0, r0); | |||
| 408 | ||||
| 409 | if (iter->width - i == 1) { | |||
| 410 | *(uint32_t*)(iter->buffer + i) = _mm_cvtsi128_si32(p); | |||
| 411 | i++; | |||
| 412 | } else { | |||
| 413 | _mm_storel_epi64((__m128i*)(iter->buffer + i), p); | |||
| 414 | i += 2; | |||
| 415 | } | |||
| 416 | } | |||
| 417 | ||||
| 418 | info->y += iter->image->transform->matrix[1][1]; | |||
| 419 | ||||
| 420 | return iter->buffer; | |||
| 421 | } | |||
| 422 | ||||
| 423 | static void ssse3_bilinear_cover_iter_fini(pixman_iter_t* iter) { | |||
| 424 | free(iter->data); | |||
| 425 | } | |||
| 426 | ||||
| 427 | static void ssse3_bilinear_cover_iter_init(pixman_iter_t* iter) { | |||
| 428 | int width = iter->width; | |||
| 429 | bilinear_info_t* info; | |||
| 430 | pixman_vector_t v; | |||
| 431 | ||||
| 432 | if (iter->x > PIXMAN_FIXED_INT_MAX || iter->x < PIXMAN_FIXED_INT_MIN || | |||
| 433 | iter->y > PIXMAN_FIXED_INT_MAX || iter->y < PIXMAN_FIXED_INT_MIN) | |||
| 434 | goto fail; | |||
| 435 | ||||
| 436 | /* Reference point is the center of the pixel */ | |||
| 437 | v.vector[0] = pixman_int_to_fixed(iter->x) + pixman_fixed_1 / 2; | |||
| 438 | v.vector[1] = pixman_int_to_fixed(iter->y) + pixman_fixed_1 / 2; | |||
| 439 | v.vector[2] = pixman_fixed_1; | |||
| 440 | ||||
| 441 | if (!pixman_transform_point_3d(iter->image->transform, &v)) goto fail; | |||
| 442 | ||||
| 443 | info = malloc(sizeof(*info) + (2 * width - 1) * sizeof(uint64_t) + 64); | |||
| 444 | if (!info) goto fail; | |||
| 445 | ||||
| 446 | info->x = v.vector[0] - pixman_fixed_1 / 2; | |||
| 447 | info->y = v.vector[1] - pixman_fixed_1 / 2; | |||
| 448 | ||||
| 449 | #define ALIGN(addr) ((void*)((((uintptr_t)(addr)) + 15) & (~15))) | |||
| 450 | ||||
| 451 | /* It is safe to set the y coordinates to -1 initially | |||
| 452 | * because COVER_CLIP_BILINEAR ensures that we will only | |||
| 453 | * be asked to fetch lines in the [0, height) interval | |||
| 454 | */ | |||
| 455 | info->lines[0].y = -1; | |||
| 456 | info->lines[0].buffer = ALIGN(&(info->data[0])); | |||
| 457 | info->lines[1].y = -1; | |||
| 458 | info->lines[1].buffer = ALIGN(info->lines[0].buffer + width); | |||
| 459 | ||||
| 460 | iter->fini = ssse3_bilinear_cover_iter_fini; | |||
| 461 | ||||
| 462 | iter->data = info; | |||
| 463 | return; | |||
| 464 | ||||
| 465 | fail: | |||
| 466 | /* Something went wrong, either a bad matrix or OOM; in such cases, | |||
| 467 | * we don't guarantee any particular rendering. | |||
| 468 | */ | |||
| 469 | iter->fini = NULL; | |||
| 470 | } | |||
| 471 | ||||
| 472 | /* scale the src from src_width/height to dest_width/height drawn | |||
| 473 | * into the rectangle x,y width,height | |||
| 474 | * src_stride and dst_stride are 4 byte units */ | |||
| 475 | bool ssse3_scale_data(uint32_t* src, int src_width, int src_height, | |||
| 476 | int src_stride, uint32_t* dest, int dest_width, | |||
| 477 | int dest_height, int dest_stride, int x, int y, int width, | |||
| 478 | int height) { | |||
| 479 | // XXX: assert(src_width > 1) | |||
| 480 | pixman_transform_t transform = { | |||
| 481 | {{pixman_fixed_1, 0, 0}, {0, pixman_fixed_1, 0}, {0, 0, pixman_fixed_1}}}; | |||
| 482 | double width_scale = ((double)src_width) / dest_width; | |||
| 483 | double height_scale = ((double)src_height) / dest_height; | |||
| 484 | #define AVOID_PADDING | |||
| 485 | #ifdef AVOID_PADDING | |||
| 486 | // scale up by enough that we don't read outside of the bounds of the source | |||
| 487 | // surface currently this is required to avoid reading out of bounds. | |||
| 488 | if (width_scale < 1) { | |||
| ||||
| 489 | width_scale = (double)(src_width - 1) / dest_width; | |||
| 490 | transform.matrix[0][2] = pixman_fixed_1 / 2; | |||
| 491 | } | |||
| 492 | if (height_scale < 1) { | |||
| 493 | height_scale = (double)(src_height - 1) / dest_height; | |||
| 494 | transform.matrix[1][2] = pixman_fixed_1 / 2; | |||
| 495 | } | |||
| 496 | #endif | |||
| 497 | transform.matrix[0][0] = pixman_double_to_fixed(width_scale); | |||
| 498 | transform.matrix[1][1] = pixman_double_to_fixed(height_scale); | |||
| 499 | transform.matrix[2][2] = pixman_fixed_1; | |||
| 500 | ||||
| 501 | bits_image_t image; | |||
| 502 | image.bits = src; | |||
| 503 | image.transform = &transform; | |||
| 504 | image.rowstride = src_stride; | |||
| 505 | ||||
| 506 | pixman_iter_t iter; | |||
| 507 | iter.image = &image; | |||
| 508 | iter.x = x; | |||
| 509 | iter.y = y; | |||
| 510 | iter.width = width; | |||
| 511 | iter.height = src_height; | |||
| 512 | iter.buffer = dest; | |||
| 513 | iter.data = NULL; | |||
| 514 | ||||
| 515 | ssse3_bilinear_cover_iter_init(&iter); | |||
| 516 | ||||
| 517 | if (!iter.fini) return false; | |||
| 518 | ||||
| 519 | if (iter.data) { | |||
| 520 | for (int iy = 0; iy < height; iy++) { | |||
| 521 | ssse3_fetch_bilinear_cover(&iter, NULL); | |||
| 522 | iter.buffer += dest_stride; | |||
| 523 | } | |||
| 524 | ssse3_bilinear_cover_iter_fini(&iter); | |||
| 525 | } | |||
| 526 | return true; | |||
| 527 | } |