00001 #ifdef FCAM_ARCH_ARM
00002 #include "Demosaic_ARM.h"
00003 #include <arm_neon.h>
00004
00005 namespace FCam {
00006
00007
// Forward declaration; the definition is not in this file.
// Both functions below call it to fill a 4096-entry byte table, which they
// then index with colour-corrected sample values (clamped to 0..1023) to get
// the final 8-bit output channels.
// NOTE(review): presumably the table bakes in contrast, black level and gamma
// for the given frame -- confirm against the definition.
00008 extern void makeLUT(const Frame &f, float contrast, int blackLevel, float gamma, unsigned char *lut);
00009
00010 Image demosaic_ARM(Frame src, float contrast, bool denoise, int blackLevel, float gamma) {
00011
00012 const int BLOCK_WIDTH = 40;
00013 const int BLOCK_HEIGHT = 24;
00014
00015 Image input = src.image();
00016
00017
00018 switch((int)src.bayerPattern()) {
00019 case GRBG:
00020 break;
00021 case RGGB:
00022 input = input.subImage(1, 0, Size(input.width()-2, input.height()));
00023 break;
00024 case BGGR:
00025 input = input.subImage(0, 1, Size(input.width(), input.height()-2));
00026 break;
00027 case GBRG:
00028 input = input.subImage(1, 1, Size(input.width()-2, input.height()-2));
00029 default:
00030 error(Event::DemosaicError, "Can't demosaic from a non-bayer sensor\n");
00031 return Image();
00032 }
00033
00034 int rawWidth = input.width();
00035 int rawHeight = input.height();
00036
00037 const int VEC_WIDTH = ((BLOCK_WIDTH + 8)/8);
00038 const int VEC_HEIGHT = ((BLOCK_HEIGHT + 8)/2);
00039
00040 int rawPixelsPerRow = input.bytesPerRow()/2 ;
00041
00042 int outWidth = rawWidth-8;
00043 int outHeight = rawHeight-8;
00044 outWidth /= BLOCK_WIDTH;
00045 outWidth *= BLOCK_WIDTH;
00046 outHeight /= BLOCK_HEIGHT;
00047 outHeight *= BLOCK_HEIGHT;
00048
00049 Image out(outWidth, outHeight, RGB24);
00050
00051
00052 if (((input.width() - 8) != (unsigned)outWidth) ||
00053 ((input.height() - 8) != (unsigned)outHeight)) {
00054 int offX = (input.width() - 8 - outWidth)/2;
00055 int offY = (input.height() - 8 - outHeight)/2;
00056 offX -= offX&1;
00057 offY -= offY&1;
00058
00059 if (offX || offY) {
00060 input = input.subImage(offX, offY, Size(outWidth+8, outHeight+8));
00061 }
00062 }
00063
00064 Time startTime = Time::now();
00065
00066
00067 float colorMatrix_f[12];
00068
00069 src.rawToRGBColorMatrix((float *)colorMatrix_f);
00070
00071 int16x4_t colorMatrix[3];
00072 for (int i = 0; i < 3; i++) {
00073 int16_t val = (int16_t)(colorMatrix_f[i*4+0] * 256 + 0.5);
00074 colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 0);
00075 val = (int16_t)(colorMatrix_f[i*4+1] * 256 + 0.5);
00076 colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 1);
00077 val = (int16_t)(colorMatrix_f[i*4+2] * 256 + 0.5);
00078 colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 2);
00079 val = (int16_t)(colorMatrix_f[i*4+3] * 256 + 0.5);
00080 colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 3);
00081 }
00082
00083
00084
00085 uint16_t out16[BLOCK_WIDTH*BLOCK_HEIGHT*3];
00086
00087
00088
00089 int16_t scratch[VEC_WIDTH*VEC_HEIGHT*4*12];
00090
00091 #define R_R_OFF (VEC_WIDTH*VEC_HEIGHT*4*0)
00092 #define R_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*1)
00093 #define R_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*2)
00094 #define R_B_OFF (VEC_WIDTH*VEC_HEIGHT*4*3)
00095
00096 #define G_R_OFF (VEC_WIDTH*VEC_HEIGHT*4*4)
00097 #define G_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*5)
00098 #define G_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*6)
00099 #define G_B_OFF (VEC_WIDTH*VEC_HEIGHT*4*7)
00100
00101 #define B_R_OFF (VEC_WIDTH*VEC_HEIGHT*4*8)
00102 #define B_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*9)
00103 #define B_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*10)
00104 #define B_B_OFF (VEC_WIDTH*VEC_HEIGHT*4*11)
00105
00106 #define R_R(i) (scratch+(i)+R_R_OFF)
00107 #define R_GR(i) (scratch+(i)+R_GR_OFF)
00108 #define R_GB(i) (scratch+(i)+R_GB_OFF)
00109 #define R_B(i) (scratch+(i)+R_B_OFF)
00110
00111 #define G_R(i) (scratch+(i)+G_R_OFF)
00112 #define G_GR(i) (scratch+(i)+G_GR_OFF)
00113 #define G_GB(i) (scratch+(i)+G_GB_OFF)
00114 #define G_B(i) (scratch+(i)+G_B_OFF)
00115
00116 #define B_R(i) (scratch+(i)+B_R_OFF)
00117 #define B_GR(i) (scratch+(i)+B_GR_OFF)
00118 #define B_GB(i) (scratch+(i)+B_GB_OFF)
00119 #define B_B(i) (scratch+(i)+B_B_OFF)
00120
00121
00122 #define G_GR_NOISY B_GR
00123 #define B_B_NOISY G_B
00124 #define R_R_NOISY G_R
00125 #define G_GB_NOISY B_GB
00126
00127
00128 unsigned char lut[4096];
00129 makeLUT(src, contrast, blackLevel, gamma, lut);
00130
00131
00132 for (int by = 0; by < rawHeight-8-BLOCK_HEIGHT+1; by += BLOCK_HEIGHT) {
00133 const short * __restrict__ blockPtr = (const short *)input(0,by);
00134 unsigned char * __restrict__ outBlockPtr = out(0, by);
00135 for (int bx = 0; bx < rawWidth-8-BLOCK_WIDTH+1; bx += BLOCK_WIDTH) {
00136
00137
00138 if (1) {
00139 register const int16_t * __restrict__ rawPtr = blockPtr;
00140 register const int16_t * __restrict__ rawPtr2 = blockPtr + rawPixelsPerRow;
00141
00142 register const int rawJump = rawPixelsPerRow*2 - VEC_WIDTH*8;
00143
00144 register int16_t * __restrict__ g_gr_ptr = denoise ? G_GR_NOISY(0) : G_GR(0);
00145 register int16_t * __restrict__ r_r_ptr = denoise ? R_R_NOISY(0) : R_R(0);
00146 register int16_t * __restrict__ b_b_ptr = denoise ? B_B_NOISY(0) : B_B(0);
00147 register int16_t * __restrict__ g_gb_ptr = denoise ? G_GB_NOISY(0) : G_GB(0);
00148
00149 for (int y = 0; y < VEC_HEIGHT; y++) {
00150 for (int x = 0; x < VEC_WIDTH/2; x++) {
00151
00152 asm volatile ("# Stage 1) Demux\n");
00153
00154
00155
00156
00157 asm volatile (
00158 "vld2.16 {d6-d9}, [%[rawPtr]]! \n\t"
00159 "vld2.16 {d10-d13}, [%[rawPtr2]]! \n\t"
00160 "vst1.16 {d6-d7}, [%[g_gr_ptr]]! \n\t"
00161 "vst1.16 {d8-d9}, [%[r_r_ptr]]! \n\t"
00162 "vst1.16 {d10-d11}, [%[b_b_ptr]]! \n\t"
00163 "vst1.16 {d12-d13}, [%[g_gb_ptr]]! \n\t" :
00164 [rawPtr]"+r"(rawPtr),
00165 [rawPtr2]"+r"(rawPtr2),
00166 [g_gr_ptr]"+r"(g_gr_ptr),
00167 [r_r_ptr]"+r"(r_r_ptr),
00168 [b_b_ptr]"+r"(b_b_ptr),
00169 [g_gb_ptr]"+r"(g_gb_ptr) ::
00170 "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "memory");
00171
00172 }
00173
00174 rawPtr += rawJump;
00175 rawPtr2 += rawJump;
00176 }
00177 }
00178
00179
00180
00181
00182
00183 if (denoise) {
00184 register int16_t * __restrict__ ptr_in = NULL;
00185 register int16_t * __restrict__ ptr_out = NULL;
00186 asm volatile("#Stage 1.5: Denoise\n\t");
00187 for (int b=0; b<4; b++) {
00188 if (b==0) ptr_in = G_GR_NOISY(0);
00189 if (b==1) ptr_in = R_R_NOISY(0);
00190 if (b==2) ptr_in = B_B_NOISY(0);
00191 if (b==3) ptr_in = G_GB_NOISY(0);
00192 if (b==0) ptr_out = G_GR(0);
00193 if (b==1) ptr_out = R_R(0);
00194 if (b==2) ptr_out = B_B(0);
00195 if (b==3) ptr_out = G_GB(0);
00196
00197
00198 for (int x = 0; x < (BLOCK_WIDTH+8); x+=8) {
00199 int16x8_t in = vld1q_s16(ptr_in);
00200 vst1q_s16(ptr_out, in);
00201 ptr_in+=8;
00202 ptr_out+=8;
00203 }
00204
00205 for (int y = 1; y < VEC_HEIGHT - 1; y++) {
00206 for (int x = 0; x < VEC_WIDTH/2; x++) {
00207 int16x8_t here = vld1q_s16(ptr_in);
00208 int16x8_t above = vld1q_s16(ptr_in + VEC_WIDTH*4);
00209 int16x8_t under = vld1q_s16(ptr_in - VEC_WIDTH*4);
00210 int16x8_t right = vld1q_s16(ptr_in + 1);
00211 int16x8_t left = vld1q_s16(ptr_in - 1);
00212 int16x8_t max, min;
00213
00214
00215 max = vmaxq_s16(left, right);
00216 max = vmaxq_s16(above, max);
00217 max = vmaxq_s16(under, max);
00218
00219 min = vminq_s16(left, right);
00220 min = vminq_s16(above, min);
00221 min = vminq_s16(under, min);
00222
00223
00224 here = vminq_s16(max, here);
00225 here = vmaxq_s16(min, here);
00226
00227 vst1q_s16(ptr_out, here);
00228 ptr_in += 8;
00229 ptr_out += 8;
00230 }
00231 }
00232
00233
00234 for (int x = 0; x < (BLOCK_WIDTH+8); x+=8) {
00235 int16x8_t in = vld1q_s16(ptr_in);
00236 vst1q_s16(ptr_out, in);
00237 ptr_in+=8;
00238 ptr_out+=8;
00239 }
00240 }
00241 }
00242
00243
00244
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259 if (1) {
00260
00261 int i = VEC_WIDTH*4;
00262
00263 register int16_t *g_gb_up_ptr = G_GB(i) - VEC_WIDTH*4;
00264 register int16_t *g_gb_here_ptr = G_GB(i);
00265 register int16_t *g_gb_left_ptr = G_GB(i) - 1;
00266 register int16_t *g_gr_down_ptr = G_GR(i) + VEC_WIDTH*4;
00267 register int16_t *g_gr_here_ptr = G_GR(i);
00268 register int16_t *g_gr_right_ptr = G_GR(i) + 1;
00269 register int16_t *g_r_ptr = G_R(i);
00270 register int16_t *g_b_ptr = G_B(i);
00271
00272 for (int y = 1; y < VEC_HEIGHT-1; y++) {
00273 for (int x = 0; x < VEC_WIDTH/2; x++) {
00274
00275 asm volatile ("#Stage 2) Green interpolation\n");
00276
00277
00278
00279 int16x8_t gb_up = vld1q_s16(g_gb_up_ptr);
00280 g_gb_up_ptr+=8;
00281 int16x8_t gb_here = vld1q_s16(g_gb_here_ptr);
00282 g_gb_here_ptr+=8;
00283 int16x8_t gb_left = vld1q_s16(g_gb_left_ptr);
00284 g_gb_left_ptr+=8;
00285 int16x8_t gr_down = vld1q_s16(g_gr_down_ptr);
00286 g_gr_down_ptr+=8;
00287 int16x8_t gr_here = vld1q_s16(g_gr_here_ptr);
00288 g_gr_here_ptr+=8;
00289 int16x8_t gr_right = vld1q_s16(g_gr_right_ptr);
00290 g_gr_right_ptr+=8;
00291
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
00320
00321
00322 int16x8_t gv_r = vhaddq_s16(gb_up, gb_here);
00323 int16x8_t gvd_r = vabdq_s16(gb_up, gb_here);
00324 int16x8_t gh_r = vhaddq_s16(gr_right, gr_here);
00325 int16x8_t ghd_r = vabdq_s16(gr_here, gr_right);
00326 int16x8_t g_r = vbslq_s16(vcltq_s16(ghd_r, gvd_r), gh_r, gv_r);
00327
00328 int16x8_t gv_b = vhaddq_s16(gr_down, gr_here);
00329 int16x8_t gvd_b = vabdq_s16(gr_down, gr_here);
00330 int16x8_t gh_b = vhaddq_s16(gb_left, gb_here);
00331 int16x8_t ghd_b = vabdq_s16(gb_left, gb_here);
00332 int16x8_t g_b = vbslq_s16(vcltq_s16(ghd_b, gvd_b), gh_b, gv_b);
00333
00334
00335
00336
00337
00338
00339
00340
00341
00342
00343
00344
00345
00346
00347
00348
00349
00350
00351
00352
00353
00354
00355
00356
00357
00358
00359
00360
00361
00362 vst1q_s16(g_r_ptr, g_r);
00363 g_r_ptr+=8;
00364 vst1q_s16(g_b_ptr, g_b);
00365 g_b_ptr+=8;
00366 }
00367 }
00368 }
00369 asm volatile ("#End of stage 2 (green interpolation)\n");
00370
00371
00372 if (1) {
00373
00374
00375
00376
00377
00378
00379
00380
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393 int i = 2*VEC_WIDTH*4;
00394
00395 for (int y = 2; y < VEC_HEIGHT-2; y++) {
00396 for (int x = 0; x < VEC_WIDTH; x++) {
00397
00398 asm volatile ("#Stage 4) r/b interpolation\n");
00399
00400
00401 int16x4_t r_here = vld1_s16(R_R(i));
00402 int16x4_t r_left = vld1_s16(R_R(i) - 1);
00403 int16x4_t r_down = vld1_s16(R_R(i) + VEC_WIDTH*4);
00404
00405 int16x4_t g_r_left = vld1_s16(G_R(i) - 1);
00406 int16x4_t g_r_here = vld1_s16(G_R(i));
00407 int16x4_t g_r_down = vld1_s16(G_R(i) + VEC_WIDTH*4);
00408
00409 int16x4_t b_up = vld1_s16(B_B(i) - VEC_WIDTH*4);
00410 int16x4_t b_here = vld1_s16(B_B(i));
00411 int16x4_t b_right = vld1_s16(B_B(i) + 1);
00412
00413 int16x4_t g_b_up = vld1_s16(G_B(i) - VEC_WIDTH*4);
00414 int16x4_t g_b_here = vld1_s16(G_B(i));
00415 int16x4_t g_b_right = vld1_s16(G_B(i) + 1);
00416
00417
00418 int16x4_t gr_here = vld1_s16(G_GR(i));
00419 int16x4_t gb_here = vld1_s16(G_GB(i));
00420
00421 {
00422 int16x4_t r_gr = vadd_s16(vhadd_s16(r_left, r_here),
00423 vsub_s16(gr_here,
00424 vhadd_s16(g_r_left, g_r_here)));
00425 int16x4_t r_gb = vadd_s16(vhadd_s16(r_here, r_down),
00426 vsub_s16(gb_here,
00427 vhadd_s16(g_r_down, g_r_here)));
00428 vst1_s16(R_GR(i), r_gr);
00429 vst1_s16(R_GB(i), r_gb);
00430 }
00431
00432 {
00433 int16x4_t r_downleft = vld1_s16(R_R(i) + VEC_WIDTH*4 - 1);
00434 int16x4_t g_r_downleft = vld1_s16(G_R(i) + VEC_WIDTH*4 - 1);
00435
00436 int16x4_t rp_b = vadd_s16(vhadd_s16(r_downleft, r_here),
00437 vsub_s16(g_b_here,
00438 vhadd_s16(g_r_downleft, g_r_here)));
00439 int16x4_t rn_b = vadd_s16(vhadd_s16(r_left, r_down),
00440 vsub_s16(g_b_here,
00441 vhadd_s16(g_r_left, g_r_down)));
00442 int16x4_t rpd_b = vabd_s16(r_downleft, r_here);
00443 int16x4_t rnd_b = vabd_s16(r_left, r_down);
00444 int16x4_t r_b = vbsl_s16(vclt_s16(rpd_b, rnd_b), rp_b, rn_b);
00445 vst1_s16(R_B(i), r_b);
00446 }
00447
00448 {
00449 int16x4_t b_gr = vadd_s16(vhadd_s16(b_up, b_here),
00450 vsub_s16(gr_here,
00451 vhadd_s16(g_b_up, g_b_here)));
00452 int16x4_t b_gb = vadd_s16(vhadd_s16(b_here, b_right),
00453 vsub_s16(gb_here,
00454 vhadd_s16(g_b_right, g_b_here)));
00455 vst1_s16(B_GR(i), b_gr);
00456 vst1_s16(B_GB(i), b_gb);
00457 }
00458
00459 {
00460 int16x4_t b_upright = vld1_s16(B_B(i) - VEC_WIDTH*4 + 1);
00461 int16x4_t g_b_upright = vld1_s16(G_B(i) - VEC_WIDTH*4 + 1);
00462
00463 int16x4_t bp_r = vadd_s16(vhadd_s16(b_upright, b_here),
00464 vsub_s16(g_r_here,
00465 vhadd_s16(g_b_upright, g_b_here)));
00466 int16x4_t bn_r = vadd_s16(vhadd_s16(b_right, b_up),
00467 vsub_s16(g_r_here,
00468 vhadd_s16(g_b_right, g_b_up)));
00469 int16x4_t bpd_r = vabd_s16(b_upright, b_here);
00470 int16x4_t bnd_r = vabd_s16(b_right, b_up);
00471 int16x4_t b_r = vbsl_s16(vclt_s16(bpd_r, bnd_r), bp_r, bn_r);
00472 vst1_s16(B_R(i), b_r);
00473 }
00474
00475
00476 i += 4;
00477 }
00478 }
00479 asm volatile ("#End of stage 4 - what_ever\n\t");
00480 }
00481
00482
00483 if (1) {
00484
00485
00486 asm volatile ("#Stage 10) Color Correction\n");
00487
00488 uint16_t * __restrict__ out16Ptr = out16;
00489
00490 int i = 2*VEC_WIDTH*4;
00491
00492 const uint16x4_t bound = vdup_n_u16(1023);
00493
00494 for (int y = 2; y < VEC_HEIGHT-2; y++) {
00495
00496
00497
00498 int16x4x2_t r0 = vzip_s16(vld1_s16(R_GR(i)), vld1_s16(R_R(i)));
00499 int16x4x2_t g0 = vzip_s16(vld1_s16(G_GR(i)), vld1_s16(G_R(i)));
00500 int16x4x2_t b0 = vzip_s16(vld1_s16(B_GR(i)), vld1_s16(B_R(i)));
00501 i += 4;
00502
00503 for (int x = 1; x < VEC_WIDTH; x++) {
00504
00505 int16x4x2_t r1 = vzip_s16(vld1_s16(R_GR(i)), vld1_s16(R_R(i)));
00506 int16x4x2_t g1 = vzip_s16(vld1_s16(G_GR(i)), vld1_s16(G_R(i)));
00507 int16x4x2_t b1 = vzip_s16(vld1_s16(B_GR(i)), vld1_s16(B_R(i)));
00508
00509
00510 int32x4_t rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00511 rout = vmlal_lane_s16(rout, r0.val[1], colorMatrix[0], 0);
00512 rout = vmlal_lane_s16(rout, g0.val[1], colorMatrix[0], 1);
00513 rout = vmlal_lane_s16(rout, b0.val[1], colorMatrix[0], 2);
00514
00515 int32x4_t gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00516 gout = vmlal_lane_s16(gout, r0.val[1], colorMatrix[1], 0);
00517 gout = vmlal_lane_s16(gout, g0.val[1], colorMatrix[1], 1);
00518 gout = vmlal_lane_s16(gout, b0.val[1], colorMatrix[1], 2);
00519
00520 int32x4_t bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00521 bout = vmlal_lane_s16(bout, r0.val[1], colorMatrix[2], 0);
00522 bout = vmlal_lane_s16(bout, g0.val[1], colorMatrix[2], 1);
00523 bout = vmlal_lane_s16(bout, b0.val[1], colorMatrix[2], 2);
00524
00525 uint16x4x3_t col16;
00526 col16.val[0] = vqrshrun_n_s32(rout, 8);
00527 col16.val[1] = vqrshrun_n_s32(gout, 8);
00528 col16.val[2] = vqrshrun_n_s32(bout, 8);
00529 col16.val[0] = vmin_u16(col16.val[0], bound);
00530 col16.val[1] = vmin_u16(col16.val[1], bound);
00531 col16.val[2] = vmin_u16(col16.val[2], bound);
00532 vst3_u16(out16Ptr, col16);
00533 out16Ptr += 12;
00534
00535 rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00536 rout = vmlal_lane_s16(rout, r1.val[0], colorMatrix[0], 0);
00537 rout = vmlal_lane_s16(rout, g1.val[0], colorMatrix[0], 1);
00538 rout = vmlal_lane_s16(rout, b1.val[0], colorMatrix[0], 2);
00539
00540 gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00541 gout = vmlal_lane_s16(gout, r1.val[0], colorMatrix[1], 0);
00542 gout = vmlal_lane_s16(gout, g1.val[0], colorMatrix[1], 1);
00543 gout = vmlal_lane_s16(gout, b1.val[0], colorMatrix[1], 2);
00544
00545 bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00546 bout = vmlal_lane_s16(bout, r1.val[0], colorMatrix[2], 0);
00547 bout = vmlal_lane_s16(bout, g1.val[0], colorMatrix[2], 1);
00548 bout = vmlal_lane_s16(bout, b1.val[0], colorMatrix[2], 2);
00549
00550 col16.val[0] = vqrshrun_n_s32(rout, 8);
00551 col16.val[1] = vqrshrun_n_s32(gout, 8);
00552 col16.val[2] = vqrshrun_n_s32(bout, 8);
00553 col16.val[0] = vmin_u16(col16.val[0], bound);
00554 col16.val[1] = vmin_u16(col16.val[1], bound);
00555 col16.val[2] = vmin_u16(col16.val[2], bound);
00556 vst3_u16(out16Ptr, col16);
00557 out16Ptr += 12;
00558
00559 r0 = r1;
00560 g0 = g1;
00561 b0 = b1;
00562
00563 i += 4;
00564 }
00565
00566
00567 i -= VEC_WIDTH*4;
00568
00569 r0 = vzip_s16(vld1_s16(R_B(i)), vld1_s16(R_GB(i)));
00570 g0 = vzip_s16(vld1_s16(G_B(i)), vld1_s16(G_GB(i)));
00571 b0 = vzip_s16(vld1_s16(B_B(i)), vld1_s16(B_GB(i)));
00572 i += 4;
00573
00574 for (int x = 1; x < VEC_WIDTH; x++) {
00575 int16x4x2_t r1 = vzip_s16(vld1_s16(R_B(i)), vld1_s16(R_GB(i)));
00576 int16x4x2_t g1 = vzip_s16(vld1_s16(G_B(i)), vld1_s16(G_GB(i)));
00577 int16x4x2_t b1 = vzip_s16(vld1_s16(B_B(i)), vld1_s16(B_GB(i)));
00578
00579
00580 int32x4_t rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00581 rout = vmlal_lane_s16(rout, r0.val[1], colorMatrix[0], 0);
00582 rout = vmlal_lane_s16(rout, g0.val[1], colorMatrix[0], 1);
00583 rout = vmlal_lane_s16(rout, b0.val[1], colorMatrix[0], 2);
00584
00585 int32x4_t gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00586 gout = vmlal_lane_s16(gout, r0.val[1], colorMatrix[1], 0);
00587 gout = vmlal_lane_s16(gout, g0.val[1], colorMatrix[1], 1);
00588 gout = vmlal_lane_s16(gout, b0.val[1], colorMatrix[1], 2);
00589
00590 int32x4_t bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00591 bout = vmlal_lane_s16(bout, r0.val[1], colorMatrix[2], 0);
00592 bout = vmlal_lane_s16(bout, g0.val[1], colorMatrix[2], 1);
00593 bout = vmlal_lane_s16(bout, b0.val[1], colorMatrix[2], 2);
00594
00595 uint16x4x3_t col16;
00596 col16.val[0] = vqrshrun_n_s32(rout, 8);
00597 col16.val[1] = vqrshrun_n_s32(gout, 8);
00598 col16.val[2] = vqrshrun_n_s32(bout, 8);
00599 col16.val[0] = vmin_u16(col16.val[0], bound);
00600 col16.val[1] = vmin_u16(col16.val[1], bound);
00601 col16.val[2] = vmin_u16(col16.val[2], bound);
00602 vst3_u16(out16Ptr, col16);
00603 out16Ptr += 12;
00604
00605 rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00606 rout = vmlal_lane_s16(rout, r1.val[0], colorMatrix[0], 0);
00607 rout = vmlal_lane_s16(rout, g1.val[0], colorMatrix[0], 1);
00608 rout = vmlal_lane_s16(rout, b1.val[0], colorMatrix[0], 2);
00609
00610 gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00611 gout = vmlal_lane_s16(gout, r1.val[0], colorMatrix[1], 0);
00612 gout = vmlal_lane_s16(gout, g1.val[0], colorMatrix[1], 1);
00613 gout = vmlal_lane_s16(gout, b1.val[0], colorMatrix[1], 2);
00614
00615 bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00616 bout = vmlal_lane_s16(bout, r1.val[0], colorMatrix[2], 0);
00617 bout = vmlal_lane_s16(bout, g1.val[0], colorMatrix[2], 1);
00618 bout = vmlal_lane_s16(bout, b1.val[0], colorMatrix[2], 2);
00619
00620 col16.val[0] = vqrshrun_n_s32(rout, 8);
00621 col16.val[1] = vqrshrun_n_s32(gout, 8);
00622 col16.val[2] = vqrshrun_n_s32(bout, 8);
00623 col16.val[0] = vmin_u16(col16.val[0], bound);
00624 col16.val[1] = vmin_u16(col16.val[1], bound);
00625 col16.val[2] = vmin_u16(col16.val[2], bound);
00626 vst3_u16(out16Ptr, col16);
00627 out16Ptr += 12;
00628
00629 r0 = r1;
00630 g0 = g1;
00631 b0 = b1;
00632
00633 i += 4;
00634 }
00635 }
00636 asm volatile("#End of stage 10) - color correction\n\t");
00637 }
00638
00639
00640 if (1) {
00641
00642 asm volatile("#Gamma Correction\n");
00643
00644 const uint16_t * __restrict__ out16Ptr = out16;
00645
00646 for (int y = 0; y < BLOCK_HEIGHT; y++) {
00647 unsigned int * __restrict__ outPtr32 = (unsigned int *)(outBlockPtr + y * outWidth * 3);
00648 for (int x = 0; x < (BLOCK_WIDTH*3)/4; x++) {
00649 unsigned val = ((lut[out16Ptr[0]] << 0) |
00650 (lut[out16Ptr[1]] << 8) |
00651 (lut[out16Ptr[2]] << 16) |
00652 (lut[out16Ptr[3]] << 24));
00653 *outPtr32++ = val;
00654 out16Ptr += 4;
00655
00656 }
00657 }
00658 asm volatile("#end of Gamma Correction\n");
00659
00660
00661
00662
00663
00664
00665
00666
00667
00668
00669
00670 }
00671
00672
00673 blockPtr += BLOCK_WIDTH;
00674 outBlockPtr += BLOCK_WIDTH*3;
00675 }
00676 }
00677
00678
00679 return out;
00680 }
00681
00682 Image makeThumbnailRAW_ARM(Frame src, float contrast, int blackLevel, float gamma) {
00683
00684
00685
00686
00687
00688
00689
00690
00691
00692
00693
00694
00695
00696 Image thumb(640, 480, RGB24);
00697 const unsigned int w = 2592, tw = 640;
00698 const unsigned int h = 1968, th = 480;
00699 const unsigned int scale = 4;
00700 const unsigned int cw = tw*scale;
00701 const unsigned int ch = th*scale;
00702 const unsigned int startX = (w-cw)/2;
00703 const unsigned int startY = (h-ch)/2;
00704 const unsigned int bytesPerRow = src.image().bytesPerRow();
00705
00706
00707 unsigned char lut[4096];
00708 makeLUT(src, contrast, blackLevel, gamma, lut);
00709
00710 unsigned char *row = src.image()(startX, startY);
00711
00712 Time startTime = Time::now();
00713 float colorMatrix_f[12];
00714 src.rawToRGBColorMatrix(colorMatrix_f);
00715
00716 register int16x4_t colorMatrix0 asm ("d0");
00717 register int16x4_t colorMatrix1 asm ("d1");
00718 register int16x4_t colorMatrix2 asm ("d2");
00719 register int16x4_t wCoord asm ("d20");
00720 register int16x4_t maxValue asm ("d21");
00721 register int16x4_t minValue asm ("d22");
00722
00723 asm volatile(
00724
00725 "vldm %[colorMatrix_f], {q2,q3,q4} \n\t"
00726 "vcvt.s32.f32 q2, q2, #8 \n\t"
00727 "vcvt.s32.f32 q3, q3, #8 \n\t"
00728 "vcvt.s32.f32 q4, q4, #8 \n\t"
00729 "vmovn.i32 d0, q2 \n\t"
00730 "vmovn.i32 d1, q3 \n\t"
00731 "vmovn.i32 d2, q4 \n\t"
00732
00733 "vmov.i16 d20, #0x4 \n\t"
00734 "vmov.i16 d21, #0x00FF \n\t"
00735 "vorr.i16 d21, #0x0300 \n\t"
00736 "vmov.i16 d22, #0x0 \n\t"
00737 : [colorMatrix0] "=w" (colorMatrix0),
00738 [colorMatrix1] "=w" (colorMatrix1),
00739 [colorMatrix2] "=w" (colorMatrix2),
00740 [wCoord] "=w" (wCoord),
00741 [maxValue] "=w" (maxValue),
00742 [minValue] "=w" (minValue)
00743 : [colorMatrix_f] "r" (colorMatrix_f)
00744 : "memory",
00745 "d3", "d4", "d5", "d6", "d7", "d8", "d9");
00746
00747 for (unsigned int ty = 0; ty <480; ty++, row+=4*bytesPerRow) {
00748 register unsigned short *px0 = (unsigned short *)row;
00749 register unsigned short *px1 = (unsigned short *)(row+1*bytesPerRow);
00750 register unsigned short *px2 = (unsigned short *)(row+2*bytesPerRow);
00751 register unsigned short *px3 = (unsigned short *)(row+3*bytesPerRow);
00752
00753 register unsigned char *dst = thumb(0,ty);
00754 for (register unsigned int tx =0; tx < 640; tx+=scale) {
00755
00756 asm volatile(
00757
00758
00759
00760
00762 "vld2.16 {d4-d7}, [%[px0]]! \n\t"
00763 "vld2.16 {d8-d11}, [%[px1]]! \n\t"
00764 "vld2.16 {d12-d15}, [%[px2]]! \n\t"
00765 "vld2.16 {d16-d19}, [%[px3]]! \n\t"
00766
00767
00768
00769
00770
00771
00772
00773
00774
00776 "vpadd.u16 d4, d4, d5 \n\t"
00777 "vpadd.u16 d5, d6, d7 \n\t"
00778 "vpadd.u16 d6, d8, d9 \n\t"
00779 "vpadd.u16 d7, d10, d11 \n\t"
00780 "vpadd.u16 d8, d12, d13 \n\t"
00781 "vpadd.u16 d9, d14, d15 \n\t"
00782 "vpadd.u16 d10, d16, d17 \n\t"
00783 "vpadd.u16 d11, d18, d19 \n\t"
00784
00785
00786
00787
00788
00790 "vadd.u16 d7, d8 \n\t"
00791 "vadd.u16 d4, d11 \n\t"
00792 "vhadd.u16 d4, d7 \n\t"
00794 "vadd.u16 d5, d9 \n\t"
00796 "vadd.u16 d6, d10 \n\t"
00797
00798
00799
00800
00801
00802
00803
00804
00805
00807
00808 "vmull.s16 q4, d5, d0[0] \n\t"
00809 "vmlal.s16 q4, d4, d0[1] \n\t"
00810 "vmlal.s16 q4, d6, d0[2] \n\t"
00811 "vmlal.s16 q4, d20, d0[3] \n\t"
00812
00813 "vmull.s16 q5, d5, d1[0] \n\t"
00814 "vmlal.s16 q5, d4, d1[1] \n\t"
00815 "vmlal.s16 q5, d6, d1[2] \n\t"
00816 "vmlal.s16 q5, d20, d1[3] \n\t"
00817
00818 "vmull.s16 q6, d5, d2[0] \n\t"
00819 "vmlal.s16 q6, d4, d2[1] \n\t"
00820 "vmlal.s16 q6, d6, d2[2] \n\t"
00821 "vmlal.s16 q6, d20, d2[3] \n\t"
00822
00823
00824
00825
00827 "vrshrn.s32 d3, q4, #10 \n\t"
00828 "vrshrn.s32 d4, q5, #10 \n\t"
00829 "vrshrn.s32 d5, q6, #10 \n\t"
00831 "vmin.s16 d3, d3, d21 \n\t"
00832 "vmin.s16 d4, d4, d21 \n\t"
00833 "vmin.s16 d5, d5, d21 \n\t"
00834 "vmax.s16 d3, d3, d22 \n\t"
00835 "vmax.s16 d4, d4, d22 \n\t"
00836 "vmax.s16 d5, d5, d22 \n\t"
00837
00838
00839
00840
00842 "vmov r0,r1, d3 \n\t"
00843
00844
00845 "uxth r2, r0 \n\t"
00846 "ldrb r4, [%[gammaTable], r2] \n\t"
00847
00848 "uxth r2, r0, ROR #16 \n\t"
00849 "ldrb r3, [%[gammaTable], r2] \n\t"
00850 "orr r4, r4, r3, LSL #24 \n\t"
00851
00852 "uxth r2, r1 \n\t"
00853 "ldrb r3, [%[gammaTable], r2] \n\t"
00854 "mov r5, r3, LSL #16 \n\t"
00855
00856 "uxth r2, r1, ROR #16 \n\t"
00857 "ldrb r3, [%[gammaTable], r2] \n\t"
00858 "mov r6, r3, LSL #8 \n\t"
00859
00860
00861
00862
00863 "vmov r0,r1, d4 \n\t"
00864
00865
00866 "uxth r2, r0 \n\t"
00867 "ldrb r3, [%[gammaTable], r2] \n\t"
00868 "orr r4, r4, r3, LSL #8 \n\t"
00869
00870 "uxth r2, r0, ROR #16 \n\t"
00871 "ldrb r3, [%[gammaTable], r2] \n\t"
00872 "orr r5, r5, r3 \n\t"
00873
00874 "uxth r2, r1 \n\t"
00875 "ldrb r3, [%[gammaTable], r2] \n\t"
00876 "orr r5, r5, r3, LSL #24 \n\t"
00877
00878 "uxth r2, r1, ROR #16 \n\t"
00879 "ldrb r3, [%[gammaTable], r2] \n\t"
00880 "orr r6, r6, r3, LSL #16 \n\t"
00881
00882
00883
00884
00885 "vmov r0,r1, d5 \n\t"
00886
00887
00888 "uxth r2, r0 \n\t"
00889 "ldrb r3, [%[gammaTable], r2] \n\t"
00890 "orr r4, r4, r3, LSL #16 \n\t"
00891
00892 "uxth r2, r0, ROR #16 \n\t"
00893 "ldrb r3, [%[gammaTable], r2] \n\t"
00894 "orr r5, r5, r3, LSL #8 \n\t"
00895
00896 "uxth r2, r1 \n\t"
00897 "ldrb r3, [%[gammaTable], r2] \n\t"
00898 "orr r6, r6, r3 \n\t"
00899
00900 "uxth r2, r1, ROR #16 \n\t"
00901 "ldrb r3, [%[gammaTable], r2] \n\t"
00902 "orr r6, r6, r3, LSL #24 \n\t"
00903
00904
00905
00906
00907 "stm %[dst]!, {r4,r5,r6} \n\t"
00908 : [px0] "+&r" (px0),
00909 [px1] "+&r" (px1),
00910 [px2] "+&r" (px2),
00911 [px3] "+&r" (px3),
00912 [dst] "+&r" (dst)
00913 : [gammaTable] "r" (lut),
00914 [colorMatrix0] "w" (colorMatrix0),
00915 [colorMatrix1] "w" (colorMatrix1),
00916 [colorMatrix2] "w" (colorMatrix2),
00917 [wCoord] "w" (wCoord),
00918 [maxValue] "w" (maxValue),
00919 [minValue] "w" (minValue)
00920 : "memory",
00921 "r0", "r1", "r2", "r3", "r4", "r5", "r6",
00922 "d3", "d4", "d5", "d6",
00923 "d7", "d8", "d9", "d10",
00924 "d11", "d12", "d13", "d14",
00925 "d15", "d16", "d17", "d18", "d19"
00926 );
00927
00928 }
00929 }
00930
00931
00932
00933 return thumb;
00934 }
00935 }
00936
00937
00938 #endif