00001 #ifdef FCAM_ARCH_ARM
00002 #include "Demosaic_ARM.h"
00003 #include <arm_neon.h>
00004
00005 namespace FCam {
00006
00007
00008 extern void makeLUT(const Frame &f, float contrast, int blackLevel, float gamma, unsigned char *lut);
00009
00010 Image demosaic_ARM(Frame src, float contrast, bool denoise, int blackLevel, float gamma) {
00011
00012 const int BLOCK_WIDTH = 40;
00013 const int BLOCK_HEIGHT = 24;
00014
00015 Image input = src.image();
00016
00017
00018 switch((int)src.platform().bayerPattern()) {
00019 case GRBG:
00020 break;
00021 case RGGB:
00022 input = input.subImage(1, 0, Size(input.width()-2, input.height()));
00023 break;
00024 case BGGR:
00025 input = input.subImage(0, 1, Size(input.width(), input.height()-2));
00026 break;
00027 case GBRG:
00028 input = input.subImage(1, 1, Size(input.width()-2, input.height()-2));
00029 default:
00030 error(Event::DemosaicError, "Can't demosaic from a non-bayer sensor\n");
00031 return Image();
00032 }
00033
00034 int rawWidth = input.width();
00035 int rawHeight = input.height();
00036
00037 const int VEC_WIDTH = ((BLOCK_WIDTH + 8)/8);
00038 const int VEC_HEIGHT = ((BLOCK_HEIGHT + 8)/2);
00039
00040 int rawPixelsPerRow = input.bytesPerRow()/2 ;
00041
00042 int outWidth = rawWidth-8;
00043 int outHeight = rawHeight-8;
00044 outWidth /= BLOCK_WIDTH;
00045 outWidth *= BLOCK_WIDTH;
00046 outHeight /= BLOCK_HEIGHT;
00047 outHeight *= BLOCK_HEIGHT;
00048
00049 Image out(outWidth, outHeight, RGB24);
00050
00051
00052 if (((input.width() - 8) != (unsigned)outWidth) ||
00053 ((input.height() - 8) != (unsigned)outHeight)) {
00054 int offX = (input.width() - 8 - outWidth)/2;
00055 int offY = (input.height() - 8 - outHeight)/2;
00056 offX -= offX&1;
00057 offY -= offY&1;
00058
00059 if (offX || offY) {
00060 input = input.subImage(offX, offY, Size(outWidth+8, outHeight+8));
00061 }
00062 }
00063
00064 Time startTime = Time::now();
00065
00066
00067 float colorMatrix_f[12];
00068
00069
00070 if (src.shot().colorMatrix().size() == 12) {
00071 for (int i = 0; i < 12; i++) {
00072 colorMatrix_f[i] = src.shot().colorMatrix()[i];
00073 }
00074 } else {
00075
00076 src.platform().rawToRGBColorMatrix(src.shot().whiteBalance, colorMatrix_f);
00077 }
00078
00079 int16x4_t colorMatrix[3];
00080 for (int i = 0; i < 3; i++) {
00081 int16_t val = (int16_t)(colorMatrix_f[i*4+0] * 256 + 0.5);
00082 colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 0);
00083 val = (int16_t)(colorMatrix_f[i*4+1] * 256 + 0.5);
00084 colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 1);
00085 val = (int16_t)(colorMatrix_f[i*4+2] * 256 + 0.5);
00086 colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 2);
00087 val = (int16_t)(colorMatrix_f[i*4+3] * 256 + 0.5);
00088 colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 3);
00089 }
00090
00091
00092
00093 uint16_t out16[BLOCK_WIDTH*BLOCK_HEIGHT*3];
00094
00095
00096
00097 int16_t scratch[VEC_WIDTH*VEC_HEIGHT*4*12];
00098
00099 #define R_R_OFF (VEC_WIDTH*VEC_HEIGHT*4*0)
00100 #define R_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*1)
00101 #define R_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*2)
00102 #define R_B_OFF (VEC_WIDTH*VEC_HEIGHT*4*3)
00103
00104 #define G_R_OFF (VEC_WIDTH*VEC_HEIGHT*4*4)
00105 #define G_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*5)
00106 #define G_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*6)
00107 #define G_B_OFF (VEC_WIDTH*VEC_HEIGHT*4*7)
00108
00109 #define B_R_OFF (VEC_WIDTH*VEC_HEIGHT*4*8)
00110 #define B_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*9)
00111 #define B_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*10)
00112 #define B_B_OFF (VEC_WIDTH*VEC_HEIGHT*4*11)
00113
00114 #define R_R(i) (scratch+(i)+R_R_OFF)
00115 #define R_GR(i) (scratch+(i)+R_GR_OFF)
00116 #define R_GB(i) (scratch+(i)+R_GB_OFF)
00117 #define R_B(i) (scratch+(i)+R_B_OFF)
00118
00119 #define G_R(i) (scratch+(i)+G_R_OFF)
00120 #define G_GR(i) (scratch+(i)+G_GR_OFF)
00121 #define G_GB(i) (scratch+(i)+G_GB_OFF)
00122 #define G_B(i) (scratch+(i)+G_B_OFF)
00123
00124 #define B_R(i) (scratch+(i)+B_R_OFF)
00125 #define B_GR(i) (scratch+(i)+B_GR_OFF)
00126 #define B_GB(i) (scratch+(i)+B_GB_OFF)
00127 #define B_B(i) (scratch+(i)+B_B_OFF)
00128
00129
00130 #define G_GR_NOISY B_GR
00131 #define B_B_NOISY G_B
00132 #define R_R_NOISY G_R
00133 #define G_GB_NOISY B_GB
00134
00135
00136 unsigned char lut[4096];
00137 makeLUT(src, contrast, blackLevel, gamma, lut);
00138
00139
00140 for (int by = 0; by < rawHeight-8-BLOCK_HEIGHT+1; by += BLOCK_HEIGHT) {
00141 const short * __restrict__ blockPtr = (const short *)input(0,by);
00142 unsigned char * __restrict__ outBlockPtr = out(0, by);
00143 for (int bx = 0; bx < rawWidth-8-BLOCK_WIDTH+1; bx += BLOCK_WIDTH) {
00144
00145
00146 if (1) {
00147 register const int16_t * __restrict__ rawPtr = blockPtr;
00148 register const int16_t * __restrict__ rawPtr2 = blockPtr + rawPixelsPerRow;
00149
00150 register const int rawJump = rawPixelsPerRow*2 - VEC_WIDTH*8;
00151
00152 register int16_t * __restrict__ g_gr_ptr = denoise ? G_GR_NOISY(0) : G_GR(0);
00153 register int16_t * __restrict__ r_r_ptr = denoise ? R_R_NOISY(0) : R_R(0);
00154 register int16_t * __restrict__ b_b_ptr = denoise ? B_B_NOISY(0) : B_B(0);
00155 register int16_t * __restrict__ g_gb_ptr = denoise ? G_GB_NOISY(0) : G_GB(0);
00156
00157 for (int y = 0; y < VEC_HEIGHT; y++) {
00158 for (int x = 0; x < VEC_WIDTH/2; x++) {
00159
00160 asm volatile ("# Stage 1) Demux\n");
00161
00162
00163
00164
00165 asm volatile (
00166 "vld2.16 {d6-d9}, [%[rawPtr]]! \n\t"
00167 "vld2.16 {d10-d13}, [%[rawPtr2]]! \n\t"
00168 "vst1.16 {d6-d7}, [%[g_gr_ptr]]! \n\t"
00169 "vst1.16 {d8-d9}, [%[r_r_ptr]]! \n\t"
00170 "vst1.16 {d10-d11}, [%[b_b_ptr]]! \n\t"
00171 "vst1.16 {d12-d13}, [%[g_gb_ptr]]! \n\t" :
00172 [rawPtr]"+r"(rawPtr),
00173 [rawPtr2]"+r"(rawPtr2),
00174 [g_gr_ptr]"+r"(g_gr_ptr),
00175 [r_r_ptr]"+r"(r_r_ptr),
00176 [b_b_ptr]"+r"(b_b_ptr),
00177 [g_gb_ptr]"+r"(g_gb_ptr) ::
00178 "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "memory");
00179
00180 }
00181
00182 rawPtr += rawJump;
00183 rawPtr2 += rawJump;
00184 }
00185 }
00186
00187
00188
00189
00190
00191 if (denoise) {
00192 register int16_t * __restrict__ ptr_in = NULL;
00193 register int16_t * __restrict__ ptr_out = NULL;
00194 asm volatile("#Stage 1.5: Denoise\n\t");
00195 for (int b=0; b<4; b++) {
00196 if (b==0) ptr_in = G_GR_NOISY(0);
00197 if (b==1) ptr_in = R_R_NOISY(0);
00198 if (b==2) ptr_in = B_B_NOISY(0);
00199 if (b==3) ptr_in = G_GB_NOISY(0);
00200 if (b==0) ptr_out = G_GR(0);
00201 if (b==1) ptr_out = R_R(0);
00202 if (b==2) ptr_out = B_B(0);
00203 if (b==3) ptr_out = G_GB(0);
00204
00205
00206 for (int x = 0; x < (BLOCK_WIDTH+8); x+=8) {
00207 int16x8_t in = vld1q_s16(ptr_in);
00208 vst1q_s16(ptr_out, in);
00209 ptr_in+=8;
00210 ptr_out+=8;
00211 }
00212
00213 for (int y = 1; y < VEC_HEIGHT - 1; y++) {
00214 for (int x = 0; x < VEC_WIDTH/2; x++) {
00215 int16x8_t here = vld1q_s16(ptr_in);
00216 int16x8_t above = vld1q_s16(ptr_in + VEC_WIDTH*4);
00217 int16x8_t under = vld1q_s16(ptr_in - VEC_WIDTH*4);
00218 int16x8_t right = vld1q_s16(ptr_in + 1);
00219 int16x8_t left = vld1q_s16(ptr_in - 1);
00220 int16x8_t max, min;
00221
00222
00223 max = vmaxq_s16(left, right);
00224 max = vmaxq_s16(above, max);
00225 max = vmaxq_s16(under, max);
00226
00227 min = vminq_s16(left, right);
00228 min = vminq_s16(above, min);
00229 min = vminq_s16(under, min);
00230
00231
00232 here = vminq_s16(max, here);
00233 here = vmaxq_s16(min, here);
00234
00235 vst1q_s16(ptr_out, here);
00236 ptr_in += 8;
00237 ptr_out += 8;
00238 }
00239 }
00240
00241
00242 for (int x = 0; x < (BLOCK_WIDTH+8); x+=8) {
00243 int16x8_t in = vld1q_s16(ptr_in);
00244 vst1q_s16(ptr_out, in);
00245 ptr_in+=8;
00246 ptr_out+=8;
00247 }
00248 }
00249 }
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267 if (1) {
00268
00269 int i = VEC_WIDTH*4;
00270
00271 register int16_t *g_gb_up_ptr = G_GB(i) - VEC_WIDTH*4;
00272 register int16_t *g_gb_here_ptr = G_GB(i);
00273 register int16_t *g_gb_left_ptr = G_GB(i) - 1;
00274 register int16_t *g_gr_down_ptr = G_GR(i) + VEC_WIDTH*4;
00275 register int16_t *g_gr_here_ptr = G_GR(i);
00276 register int16_t *g_gr_right_ptr = G_GR(i) + 1;
00277 register int16_t *g_r_ptr = G_R(i);
00278 register int16_t *g_b_ptr = G_B(i);
00279
00280 for (int y = 1; y < VEC_HEIGHT-1; y++) {
00281 for (int x = 0; x < VEC_WIDTH/2; x++) {
00282
00283 asm volatile ("#Stage 2) Green interpolation\n");
00284
00285
00286
00287 int16x8_t gb_up = vld1q_s16(g_gb_up_ptr);
00288 g_gb_up_ptr+=8;
00289 int16x8_t gb_here = vld1q_s16(g_gb_here_ptr);
00290 g_gb_here_ptr+=8;
00291 int16x8_t gb_left = vld1q_s16(g_gb_left_ptr);
00292 g_gb_left_ptr+=8;
00293 int16x8_t gr_down = vld1q_s16(g_gr_down_ptr);
00294 g_gr_down_ptr+=8;
00295 int16x8_t gr_here = vld1q_s16(g_gr_here_ptr);
00296 g_gr_here_ptr+=8;
00297 int16x8_t gr_right = vld1q_s16(g_gr_right_ptr);
00298 g_gr_right_ptr+=8;
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329
00330 int16x8_t gv_r = vhaddq_s16(gb_up, gb_here);
00331 int16x8_t gvd_r = vabdq_s16(gb_up, gb_here);
00332 int16x8_t gh_r = vhaddq_s16(gr_right, gr_here);
00333 int16x8_t ghd_r = vabdq_s16(gr_here, gr_right);
00334 int16x8_t g_r = vbslq_s16(vcltq_s16(ghd_r, gvd_r), gh_r, gv_r);
00335
00336 int16x8_t gv_b = vhaddq_s16(gr_down, gr_here);
00337 int16x8_t gvd_b = vabdq_s16(gr_down, gr_here);
00338 int16x8_t gh_b = vhaddq_s16(gb_left, gb_here);
00339 int16x8_t ghd_b = vabdq_s16(gb_left, gb_here);
00340 int16x8_t g_b = vbslq_s16(vcltq_s16(ghd_b, gvd_b), gh_b, gv_b);
00341
00342
00343
00344
00345
00346
00347
00348
00349
00350
00351
00352
00353
00354
00355
00356
00357
00358
00359
00360
00361
00362
00363
00364
00365
00366
00367
00368
00369
00370 vst1q_s16(g_r_ptr, g_r);
00371 g_r_ptr+=8;
00372 vst1q_s16(g_b_ptr, g_b);
00373 g_b_ptr+=8;
00374 }
00375 }
00376 }
00377 asm volatile ("#End of stage 2 (green interpolation)\n");
00378
00379
00380 if (1) {
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401 int i = 2*VEC_WIDTH*4;
00402
00403 for (int y = 2; y < VEC_HEIGHT-2; y++) {
00404 for (int x = 0; x < VEC_WIDTH; x++) {
00405
00406 asm volatile ("#Stage 4) r/b interpolation\n");
00407
00408
00409 int16x4_t r_here = vld1_s16(R_R(i));
00410 int16x4_t r_left = vld1_s16(R_R(i) - 1);
00411 int16x4_t r_down = vld1_s16(R_R(i) + VEC_WIDTH*4);
00412
00413 int16x4_t g_r_left = vld1_s16(G_R(i) - 1);
00414 int16x4_t g_r_here = vld1_s16(G_R(i));
00415 int16x4_t g_r_down = vld1_s16(G_R(i) + VEC_WIDTH*4);
00416
00417 int16x4_t b_up = vld1_s16(B_B(i) - VEC_WIDTH*4);
00418 int16x4_t b_here = vld1_s16(B_B(i));
00419 int16x4_t b_right = vld1_s16(B_B(i) + 1);
00420
00421 int16x4_t g_b_up = vld1_s16(G_B(i) - VEC_WIDTH*4);
00422 int16x4_t g_b_here = vld1_s16(G_B(i));
00423 int16x4_t g_b_right = vld1_s16(G_B(i) + 1);
00424
00425
00426 int16x4_t gr_here = vld1_s16(G_GR(i));
00427 int16x4_t gb_here = vld1_s16(G_GB(i));
00428
00429 {
00430 int16x4_t r_gr = vadd_s16(vhadd_s16(r_left, r_here),
00431 vsub_s16(gr_here,
00432 vhadd_s16(g_r_left, g_r_here)));
00433 int16x4_t r_gb = vadd_s16(vhadd_s16(r_here, r_down),
00434 vsub_s16(gb_here,
00435 vhadd_s16(g_r_down, g_r_here)));
00436 vst1_s16(R_GR(i), r_gr);
00437 vst1_s16(R_GB(i), r_gb);
00438 }
00439
00440 {
00441 int16x4_t r_downleft = vld1_s16(R_R(i) + VEC_WIDTH*4 - 1);
00442 int16x4_t g_r_downleft = vld1_s16(G_R(i) + VEC_WIDTH*4 - 1);
00443
00444 int16x4_t rp_b = vadd_s16(vhadd_s16(r_downleft, r_here),
00445 vsub_s16(g_b_here,
00446 vhadd_s16(g_r_downleft, g_r_here)));
00447 int16x4_t rn_b = vadd_s16(vhadd_s16(r_left, r_down),
00448 vsub_s16(g_b_here,
00449 vhadd_s16(g_r_left, g_r_down)));
00450 int16x4_t rpd_b = vabd_s16(r_downleft, r_here);
00451 int16x4_t rnd_b = vabd_s16(r_left, r_down);
00452 int16x4_t r_b = vbsl_s16(vclt_s16(rpd_b, rnd_b), rp_b, rn_b);
00453 vst1_s16(R_B(i), r_b);
00454 }
00455
00456 {
00457 int16x4_t b_gr = vadd_s16(vhadd_s16(b_up, b_here),
00458 vsub_s16(gr_here,
00459 vhadd_s16(g_b_up, g_b_here)));
00460 int16x4_t b_gb = vadd_s16(vhadd_s16(b_here, b_right),
00461 vsub_s16(gb_here,
00462 vhadd_s16(g_b_right, g_b_here)));
00463 vst1_s16(B_GR(i), b_gr);
00464 vst1_s16(B_GB(i), b_gb);
00465 }
00466
00467 {
00468 int16x4_t b_upright = vld1_s16(B_B(i) - VEC_WIDTH*4 + 1);
00469 int16x4_t g_b_upright = vld1_s16(G_B(i) - VEC_WIDTH*4 + 1);
00470
00471 int16x4_t bp_r = vadd_s16(vhadd_s16(b_upright, b_here),
00472 vsub_s16(g_r_here,
00473 vhadd_s16(g_b_upright, g_b_here)));
00474 int16x4_t bn_r = vadd_s16(vhadd_s16(b_right, b_up),
00475 vsub_s16(g_r_here,
00476 vhadd_s16(g_b_right, g_b_up)));
00477 int16x4_t bpd_r = vabd_s16(b_upright, b_here);
00478 int16x4_t bnd_r = vabd_s16(b_right, b_up);
00479 int16x4_t b_r = vbsl_s16(vclt_s16(bpd_r, bnd_r), bp_r, bn_r);
00480 vst1_s16(B_R(i), b_r);
00481 }
00482
00483
00484 i += 4;
00485 }
00486 }
00487 asm volatile ("#End of stage 4 - what_ever\n\t");
00488 }
00489
00490
00491 if (1) {
00492
00493
00494 asm volatile ("#Stage 10) Color Correction\n");
00495
00496 uint16_t * __restrict__ out16Ptr = out16;
00497
00498 int i = 2*VEC_WIDTH*4;
00499
00500 const uint16x4_t bound = vdup_n_u16(1023);
00501
00502 for (int y = 2; y < VEC_HEIGHT-2; y++) {
00503
00504
00505
00506 int16x4x2_t r0 = vzip_s16(vld1_s16(R_GR(i)), vld1_s16(R_R(i)));
00507 int16x4x2_t g0 = vzip_s16(vld1_s16(G_GR(i)), vld1_s16(G_R(i)));
00508 int16x4x2_t b0 = vzip_s16(vld1_s16(B_GR(i)), vld1_s16(B_R(i)));
00509 i += 4;
00510
00511 for (int x = 1; x < VEC_WIDTH; x++) {
00512
00513 int16x4x2_t r1 = vzip_s16(vld1_s16(R_GR(i)), vld1_s16(R_R(i)));
00514 int16x4x2_t g1 = vzip_s16(vld1_s16(G_GR(i)), vld1_s16(G_R(i)));
00515 int16x4x2_t b1 = vzip_s16(vld1_s16(B_GR(i)), vld1_s16(B_R(i)));
00516
00517
00518 int32x4_t rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00519 rout = vmlal_lane_s16(rout, r0.val[1], colorMatrix[0], 0);
00520 rout = vmlal_lane_s16(rout, g0.val[1], colorMatrix[0], 1);
00521 rout = vmlal_lane_s16(rout, b0.val[1], colorMatrix[0], 2);
00522
00523 int32x4_t gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00524 gout = vmlal_lane_s16(gout, r0.val[1], colorMatrix[1], 0);
00525 gout = vmlal_lane_s16(gout, g0.val[1], colorMatrix[1], 1);
00526 gout = vmlal_lane_s16(gout, b0.val[1], colorMatrix[1], 2);
00527
00528 int32x4_t bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00529 bout = vmlal_lane_s16(bout, r0.val[1], colorMatrix[2], 0);
00530 bout = vmlal_lane_s16(bout, g0.val[1], colorMatrix[2], 1);
00531 bout = vmlal_lane_s16(bout, b0.val[1], colorMatrix[2], 2);
00532
00533 uint16x4x3_t col16;
00534 col16.val[0] = vqrshrun_n_s32(rout, 8);
00535 col16.val[1] = vqrshrun_n_s32(gout, 8);
00536 col16.val[2] = vqrshrun_n_s32(bout, 8);
00537 col16.val[0] = vmin_u16(col16.val[0], bound);
00538 col16.val[1] = vmin_u16(col16.val[1], bound);
00539 col16.val[2] = vmin_u16(col16.val[2], bound);
00540 vst3_u16(out16Ptr, col16);
00541 out16Ptr += 12;
00542
00543 rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00544 rout = vmlal_lane_s16(rout, r1.val[0], colorMatrix[0], 0);
00545 rout = vmlal_lane_s16(rout, g1.val[0], colorMatrix[0], 1);
00546 rout = vmlal_lane_s16(rout, b1.val[0], colorMatrix[0], 2);
00547
00548 gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00549 gout = vmlal_lane_s16(gout, r1.val[0], colorMatrix[1], 0);
00550 gout = vmlal_lane_s16(gout, g1.val[0], colorMatrix[1], 1);
00551 gout = vmlal_lane_s16(gout, b1.val[0], colorMatrix[1], 2);
00552
00553 bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00554 bout = vmlal_lane_s16(bout, r1.val[0], colorMatrix[2], 0);
00555 bout = vmlal_lane_s16(bout, g1.val[0], colorMatrix[2], 1);
00556 bout = vmlal_lane_s16(bout, b1.val[0], colorMatrix[2], 2);
00557
00558 col16.val[0] = vqrshrun_n_s32(rout, 8);
00559 col16.val[1] = vqrshrun_n_s32(gout, 8);
00560 col16.val[2] = vqrshrun_n_s32(bout, 8);
00561 col16.val[0] = vmin_u16(col16.val[0], bound);
00562 col16.val[1] = vmin_u16(col16.val[1], bound);
00563 col16.val[2] = vmin_u16(col16.val[2], bound);
00564 vst3_u16(out16Ptr, col16);
00565 out16Ptr += 12;
00566
00567 r0 = r1;
00568 g0 = g1;
00569 b0 = b1;
00570
00571 i += 4;
00572 }
00573
00574
00575 i -= VEC_WIDTH*4;
00576
00577 r0 = vzip_s16(vld1_s16(R_B(i)), vld1_s16(R_GB(i)));
00578 g0 = vzip_s16(vld1_s16(G_B(i)), vld1_s16(G_GB(i)));
00579 b0 = vzip_s16(vld1_s16(B_B(i)), vld1_s16(B_GB(i)));
00580 i += 4;
00581
00582 for (int x = 1; x < VEC_WIDTH; x++) {
00583 int16x4x2_t r1 = vzip_s16(vld1_s16(R_B(i)), vld1_s16(R_GB(i)));
00584 int16x4x2_t g1 = vzip_s16(vld1_s16(G_B(i)), vld1_s16(G_GB(i)));
00585 int16x4x2_t b1 = vzip_s16(vld1_s16(B_B(i)), vld1_s16(B_GB(i)));
00586
00587
00588 int32x4_t rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00589 rout = vmlal_lane_s16(rout, r0.val[1], colorMatrix[0], 0);
00590 rout = vmlal_lane_s16(rout, g0.val[1], colorMatrix[0], 1);
00591 rout = vmlal_lane_s16(rout, b0.val[1], colorMatrix[0], 2);
00592
00593 int32x4_t gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00594 gout = vmlal_lane_s16(gout, r0.val[1], colorMatrix[1], 0);
00595 gout = vmlal_lane_s16(gout, g0.val[1], colorMatrix[1], 1);
00596 gout = vmlal_lane_s16(gout, b0.val[1], colorMatrix[1], 2);
00597
00598 int32x4_t bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00599 bout = vmlal_lane_s16(bout, r0.val[1], colorMatrix[2], 0);
00600 bout = vmlal_lane_s16(bout, g0.val[1], colorMatrix[2], 1);
00601 bout = vmlal_lane_s16(bout, b0.val[1], colorMatrix[2], 2);
00602
00603 uint16x4x3_t col16;
00604 col16.val[0] = vqrshrun_n_s32(rout, 8);
00605 col16.val[1] = vqrshrun_n_s32(gout, 8);
00606 col16.val[2] = vqrshrun_n_s32(bout, 8);
00607 col16.val[0] = vmin_u16(col16.val[0], bound);
00608 col16.val[1] = vmin_u16(col16.val[1], bound);
00609 col16.val[2] = vmin_u16(col16.val[2], bound);
00610 vst3_u16(out16Ptr, col16);
00611 out16Ptr += 12;
00612
00613 rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00614 rout = vmlal_lane_s16(rout, r1.val[0], colorMatrix[0], 0);
00615 rout = vmlal_lane_s16(rout, g1.val[0], colorMatrix[0], 1);
00616 rout = vmlal_lane_s16(rout, b1.val[0], colorMatrix[0], 2);
00617
00618 gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00619 gout = vmlal_lane_s16(gout, r1.val[0], colorMatrix[1], 0);
00620 gout = vmlal_lane_s16(gout, g1.val[0], colorMatrix[1], 1);
00621 gout = vmlal_lane_s16(gout, b1.val[0], colorMatrix[1], 2);
00622
00623 bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00624 bout = vmlal_lane_s16(bout, r1.val[0], colorMatrix[2], 0);
00625 bout = vmlal_lane_s16(bout, g1.val[0], colorMatrix[2], 1);
00626 bout = vmlal_lane_s16(bout, b1.val[0], colorMatrix[2], 2);
00627
00628 col16.val[0] = vqrshrun_n_s32(rout, 8);
00629 col16.val[1] = vqrshrun_n_s32(gout, 8);
00630 col16.val[2] = vqrshrun_n_s32(bout, 8);
00631 col16.val[0] = vmin_u16(col16.val[0], bound);
00632 col16.val[1] = vmin_u16(col16.val[1], bound);
00633 col16.val[2] = vmin_u16(col16.val[2], bound);
00634 vst3_u16(out16Ptr, col16);
00635 out16Ptr += 12;
00636
00637 r0 = r1;
00638 g0 = g1;
00639 b0 = b1;
00640
00641 i += 4;
00642 }
00643 }
00644 asm volatile("#End of stage 10) - color correction\n\t");
00645 }
00646
00647
00648 if (1) {
00649
00650 asm volatile("#Gamma Correction\n");
00651
00652 const uint16_t * __restrict__ out16Ptr = out16;
00653
00654 for (int y = 0; y < BLOCK_HEIGHT; y++) {
00655 unsigned int * __restrict__ outPtr32 = (unsigned int *)(outBlockPtr + y * outWidth * 3);
00656 for (int x = 0; x < (BLOCK_WIDTH*3)/4; x++) {
00657 unsigned val = ((lut[out16Ptr[0]] << 0) |
00658 (lut[out16Ptr[1]] << 8) |
00659 (lut[out16Ptr[2]] << 16) |
00660 (lut[out16Ptr[3]] << 24));
00661 *outPtr32++ = val;
00662 out16Ptr += 4;
00663
00664 }
00665 }
00666 asm volatile("#end of Gamma Correction\n");
00667
00668
00669
00670
00671
00672
00673
00674
00675
00676
00677
00678 }
00679
00680
00681 blockPtr += BLOCK_WIDTH;
00682 outBlockPtr += BLOCK_WIDTH*3;
00683 }
00684 }
00685
00686
00687 return out;
00688 }
00689
00690 Image makeThumbnailRAW_ARM(Frame src, float contrast, int blackLevel, float gamma) {
00691
00692
00693
00694
00695
00696
00697
00698
00699
00700
00701
00702
00703
00704 Image thumb(640, 480, RGB24);
00705 const unsigned int w = 2592, tw = 640;
00706 const unsigned int h = 1968, th = 480;
00707 const unsigned int scale = 4;
00708 const unsigned int cw = tw*scale;
00709 const unsigned int ch = th*scale;
00710 const unsigned int startX = (w-cw)/2;
00711 const unsigned int startY = (h-ch)/2;
00712 const unsigned int bytesPerRow = src.image().bytesPerRow();
00713
00714
00715 unsigned char lut[4096];
00716 makeLUT(src, contrast, blackLevel, gamma, lut);
00717
00718 unsigned char *row = src.image()(startX, startY);
00719
00720 Time startTime = Time::now();
00721 float colorMatrix_f[12];
00722
00723
00724 if (src.shot().colorMatrix().size() == 12) {
00725 for (int i = 0; i < 12; i++) {
00726 colorMatrix_f[i] = src.shot().colorMatrix()[i];
00727 }
00728 printf("Making thumbnail with custom WB\n");
00729 } else {
00730
00731 src.platform().rawToRGBColorMatrix(src.shot().whiteBalance, colorMatrix_f);
00732 printf("Making thumbnail with platform WB\n");
00733 }
00734
00735 register int16x4_t colorMatrix0 asm ("d0");
00736 register int16x4_t colorMatrix1 asm ("d1");
00737 register int16x4_t colorMatrix2 asm ("d2");
00738 register int16x4_t wCoord asm ("d20");
00739 register int16x4_t maxValue asm ("d21");
00740 register int16x4_t minValue asm ("d22");
00741
00742 asm volatile(
00743
00744 "vldm %[colorMatrix_f], {q2,q3,q4} \n\t"
00745 "vcvt.s32.f32 q2, q2, #8 \n\t"
00746 "vcvt.s32.f32 q3, q3, #8 \n\t"
00747 "vcvt.s32.f32 q4, q4, #8 \n\t"
00748 "vmovn.i32 d0, q2 \n\t"
00749 "vmovn.i32 d1, q3 \n\t"
00750 "vmovn.i32 d2, q4 \n\t"
00751
00752 "vmov.i16 d20, #0x4 \n\t"
00753 "vmov.i16 d21, #0x00FF \n\t"
00754 "vorr.i16 d21, #0x0300 \n\t"
00755 "vmov.i16 d22, #0x0 \n\t"
00756 : [colorMatrix0] "=w" (colorMatrix0),
00757 [colorMatrix1] "=w" (colorMatrix1),
00758 [colorMatrix2] "=w" (colorMatrix2),
00759 [wCoord] "=w" (wCoord),
00760 [maxValue] "=w" (maxValue),
00761 [minValue] "=w" (minValue)
00762 : [colorMatrix_f] "r" (colorMatrix_f)
00763 : "memory",
00764 "d3", "d4", "d5", "d6", "d7", "d8", "d9");
00765
00766 for (unsigned int ty = 0; ty <480; ty++, row+=4*bytesPerRow) {
00767 register unsigned short *px0 = (unsigned short *)row;
00768 register unsigned short *px1 = (unsigned short *)(row+1*bytesPerRow);
00769 register unsigned short *px2 = (unsigned short *)(row+2*bytesPerRow);
00770 register unsigned short *px3 = (unsigned short *)(row+3*bytesPerRow);
00771
00772 register unsigned char *dst = thumb(0,ty);
00773 for (register unsigned int tx =0; tx < 640; tx+=scale) {
00774
00775 asm volatile(
00776
00777
00778
00779
00781 "vld2.16 {d4-d7}, [%[px0]]! \n\t"
00782 "vld2.16 {d8-d11}, [%[px1]]! \n\t"
00783 "vld2.16 {d12-d15}, [%[px2]]! \n\t"
00784 "vld2.16 {d16-d19}, [%[px3]]! \n\t"
00785
00786
00787
00788
00789
00790
00791
00792
00793
00795 "vpadd.u16 d4, d4, d5 \n\t"
00796 "vpadd.u16 d5, d6, d7 \n\t"
00797 "vpadd.u16 d6, d8, d9 \n\t"
00798 "vpadd.u16 d7, d10, d11 \n\t"
00799 "vpadd.u16 d8, d12, d13 \n\t"
00800 "vpadd.u16 d9, d14, d15 \n\t"
00801 "vpadd.u16 d10, d16, d17 \n\t"
00802 "vpadd.u16 d11, d18, d19 \n\t"
00803
00804
00805
00806
00807
00809 "vadd.u16 d7, d8 \n\t"
00810 "vadd.u16 d4, d11 \n\t"
00811 "vhadd.u16 d4, d7 \n\t"
00813 "vadd.u16 d5, d9 \n\t"
00815 "vadd.u16 d6, d10 \n\t"
00816
00817
00818
00819
00820
00821
00822
00823
00824
00826
00827 "vmull.s16 q4, d5, d0[0] \n\t"
00828 "vmlal.s16 q4, d4, d0[1] \n\t"
00829 "vmlal.s16 q4, d6, d0[2] \n\t"
00830 "vmlal.s16 q4, d20, d0[3] \n\t"
00831
00832 "vmull.s16 q5, d5, d1[0] \n\t"
00833 "vmlal.s16 q5, d4, d1[1] \n\t"
00834 "vmlal.s16 q5, d6, d1[2] \n\t"
00835 "vmlal.s16 q5, d20, d1[3] \n\t"
00836
00837 "vmull.s16 q6, d5, d2[0] \n\t"
00838 "vmlal.s16 q6, d4, d2[1] \n\t"
00839 "vmlal.s16 q6, d6, d2[2] \n\t"
00840 "vmlal.s16 q6, d20, d2[3] \n\t"
00841
00842
00843
00844
00846 "vrshrn.s32 d3, q4, #10 \n\t"
00847 "vrshrn.s32 d4, q5, #10 \n\t"
00848 "vrshrn.s32 d5, q6, #10 \n\t"
00850 "vmin.s16 d3, d3, d21 \n\t"
00851 "vmin.s16 d4, d4, d21 \n\t"
00852 "vmin.s16 d5, d5, d21 \n\t"
00853 "vmax.s16 d3, d3, d22 \n\t"
00854 "vmax.s16 d4, d4, d22 \n\t"
00855 "vmax.s16 d5, d5, d22 \n\t"
00856
00857
00858
00859
00861 "vmov r0,r1, d3 \n\t"
00862
00863
00864 "uxth r2, r0 \n\t"
00865 "ldrb r4, [%[gammaTable], r2] \n\t"
00866
00867 "uxth r2, r0, ROR #16 \n\t"
00868 "ldrb r3, [%[gammaTable], r2] \n\t"
00869 "orr r4, r4, r3, LSL #24 \n\t"
00870
00871 "uxth r2, r1 \n\t"
00872 "ldrb r3, [%[gammaTable], r2] \n\t"
00873 "mov r5, r3, LSL #16 \n\t"
00874
00875 "uxth r2, r1, ROR #16 \n\t"
00876 "ldrb r3, [%[gammaTable], r2] \n\t"
00877 "mov r6, r3, LSL #8 \n\t"
00878
00879
00880
00881
00882 "vmov r0,r1, d4 \n\t"
00883
00884
00885 "uxth r2, r0 \n\t"
00886 "ldrb r3, [%[gammaTable], r2] \n\t"
00887 "orr r4, r4, r3, LSL #8 \n\t"
00888
00889 "uxth r2, r0, ROR #16 \n\t"
00890 "ldrb r3, [%[gammaTable], r2] \n\t"
00891 "orr r5, r5, r3 \n\t"
00892
00893 "uxth r2, r1 \n\t"
00894 "ldrb r3, [%[gammaTable], r2] \n\t"
00895 "orr r5, r5, r3, LSL #24 \n\t"
00896
00897 "uxth r2, r1, ROR #16 \n\t"
00898 "ldrb r3, [%[gammaTable], r2] \n\t"
00899 "orr r6, r6, r3, LSL #16 \n\t"
00900
00901
00902
00903
00904 "vmov r0,r1, d5 \n\t"
00905
00906
00907 "uxth r2, r0 \n\t"
00908 "ldrb r3, [%[gammaTable], r2] \n\t"
00909 "orr r4, r4, r3, LSL #16 \n\t"
00910
00911 "uxth r2, r0, ROR #16 \n\t"
00912 "ldrb r3, [%[gammaTable], r2] \n\t"
00913 "orr r5, r5, r3, LSL #8 \n\t"
00914
00915 "uxth r2, r1 \n\t"
00916 "ldrb r3, [%[gammaTable], r2] \n\t"
00917 "orr r6, r6, r3 \n\t"
00918
00919 "uxth r2, r1, ROR #16 \n\t"
00920 "ldrb r3, [%[gammaTable], r2] \n\t"
00921 "orr r6, r6, r3, LSL #24 \n\t"
00922
00923
00924
00925
00926 "stm %[dst]!, {r4,r5,r6} \n\t"
00927 : [px0] "+&r" (px0),
00928 [px1] "+&r" (px1),
00929 [px2] "+&r" (px2),
00930 [px3] "+&r" (px3),
00931 [dst] "+&r" (dst)
00932 : [gammaTable] "r" (lut),
00933 [colorMatrix0] "w" (colorMatrix0),
00934 [colorMatrix1] "w" (colorMatrix1),
00935 [colorMatrix2] "w" (colorMatrix2),
00936 [wCoord] "w" (wCoord),
00937 [maxValue] "w" (maxValue),
00938 [minValue] "w" (minValue)
00939 : "memory",
00940 "r0", "r1", "r2", "r3", "r4", "r5", "r6",
00941 "d3", "d4", "d5", "d6",
00942 "d7", "d8", "d9", "d10",
00943 "d11", "d12", "d13", "d14",
00944 "d15", "d16", "d17", "d18", "d19"
00945 );
00946
00947 }
00948 }
00949
00950
00951
00952 return thumb;
00953 }
00954 }
00955
00956
00957 #endif