• Main Page
  • Related Pages
  • Namespaces
  • Classes
  • Files
  • File List
  • File Members

src/processing/Demosaic_ARM.cpp

00001 #ifdef FCAM_ARCH_ARM
00002 #include "Demosaic_ARM.h"
00003 #include <arm_neon.h>
00004 
00005 namespace FCam {
00006 
00007     // Fill `lut` with a linear luminance -> pixel value lookup table (only declared here; the definition is not in this file's visible portion)
00008     extern void makeLUT(const Frame &f, float contrast, int blackLevel, float gamma, unsigned char *lut);
00009 
00010     Image demosaic_ARM(Frame src, float contrast, bool denoise, int blackLevel, float gamma) {
00011 
00012         const int BLOCK_WIDTH  = 40;
00013         const int BLOCK_HEIGHT = 24;
00014 
00015         Image input = src.image();
00016 
00017         // Check we're the right bayer pattern. If not crop and continue.
00018         switch((int)src.platform().bayerPattern()) {
00019         case GRBG:
00020             break;
00021         case RGGB:
00022             input = input.subImage(1, 0, Size(input.width()-2, input.height()));
00023             break;
00024         case BGGR:
00025             input = input.subImage(0, 1, Size(input.width(), input.height()-2));
00026             break;
00027         case GBRG:
00028             input = input.subImage(1, 1, Size(input.width()-2, input.height()-2));
00029         default:
00030             error(Event::DemosaicError, "Can't demosaic from a non-bayer sensor\n");
00031             return Image();
00032         }       
00033 
00034         int rawWidth = input.width();
00035         int rawHeight = input.height();
00036 
00037         const int VEC_WIDTH = ((BLOCK_WIDTH + 8)/8);
00038         const int VEC_HEIGHT = ((BLOCK_HEIGHT + 8)/2);       
00039 
00040         int rawPixelsPerRow = input.bytesPerRow()/2 ; // Assumes bytesPerRow is even
00041 
00042         int outWidth = rawWidth-8;
00043         int outHeight = rawHeight-8;
00044         outWidth /= BLOCK_WIDTH;
00045         outWidth *= BLOCK_WIDTH;
00046         outHeight /= BLOCK_HEIGHT;
00047         outHeight *= BLOCK_HEIGHT;
00048 
00049         Image out(outWidth, outHeight, RGB24);
00050                 
00051         // Check we're the right size, if not, crop center
00052         if (((input.width() - 8) != (unsigned)outWidth) ||
00053             ((input.height() - 8) != (unsigned)outHeight)) { 
00054             int offX = (input.width() - 8 - outWidth)/2;
00055             int offY = (input.height() - 8 - outHeight)/2;
00056             offX -= offX&1;
00057             offY -= offY&1;
00058             
00059             if (offX || offY) {
00060                 input = input.subImage(offX, offY, Size(outWidth+8, outHeight+8));
00061             }
00062         }           
00063         
00064         Time startTime = Time::now(); 
00065 
00066         // Prepare the color matrix in S8.8 fixed point
00067         float colorMatrix_f[12];
00068         
00069         // Check if there's a custom color matrix
00070         if (src.shot().colorMatrix().size() == 12) {
00071             for (int i = 0; i < 12; i++) {
00072                 colorMatrix_f[i] = src.shot().colorMatrix()[i];
00073             }
00074         } else {
00075             // Otherwise use the platform version
00076             src.platform().rawToRGBColorMatrix(src.shot().whiteBalance, colorMatrix_f);
00077         }
00078 
00079         int16x4_t colorMatrix[3];
00080         for (int i = 0; i < 3; i++) {
00081             int16_t val = (int16_t)(colorMatrix_f[i*4+0] * 256 + 0.5);
00082             colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 0);
00083             val = (int16_t)(colorMatrix_f[i*4+1] * 256 + 0.5);
00084             colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 1);
00085             val = (int16_t)(colorMatrix_f[i*4+2] * 256 + 0.5);
00086             colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 2);
00087             val = (int16_t)(colorMatrix_f[i*4+3] * 256 + 0.5);
00088             colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 3);
00089         }
00090 
00091         // A buffer to store data after demosaic and color correction
00092         // but before gamma correction
00093         uint16_t out16[BLOCK_WIDTH*BLOCK_HEIGHT*3];
00094 
00095         // Various color channels. Only 4 of them are defined before
00096         // demosaic, all of them are defined after demosaic
00097         int16_t scratch[VEC_WIDTH*VEC_HEIGHT*4*12];
00098 
00099         #define R_R_OFF  (VEC_WIDTH*VEC_HEIGHT*4*0)
00100         #define R_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*1)
00101         #define R_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*2)
00102         #define R_B_OFF  (VEC_WIDTH*VEC_HEIGHT*4*3)
00103 
00104         #define G_R_OFF  (VEC_WIDTH*VEC_HEIGHT*4*4)
00105         #define G_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*5)
00106         #define G_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*6)
00107         #define G_B_OFF  (VEC_WIDTH*VEC_HEIGHT*4*7)
00108 
00109         #define B_R_OFF  (VEC_WIDTH*VEC_HEIGHT*4*8)
00110         #define B_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*9)
00111         #define B_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*10)
00112         #define B_B_OFF  (VEC_WIDTH*VEC_HEIGHT*4*11)
00113 
00114         #define R_R(i)  (scratch+(i)+R_R_OFF)
00115         #define R_GR(i) (scratch+(i)+R_GR_OFF)
00116         #define R_GB(i) (scratch+(i)+R_GB_OFF)
00117         #define R_B(i)  (scratch+(i)+R_B_OFF)
00118 
00119         #define G_R(i)  (scratch+(i)+G_R_OFF)
00120         #define G_GR(i) (scratch+(i)+G_GR_OFF)
00121         #define G_GB(i) (scratch+(i)+G_GB_OFF)
00122         #define G_B(i)  (scratch+(i)+G_B_OFF)
00123 
00124         #define B_R(i)  (scratch+(i)+B_R_OFF)
00125         #define B_GR(i) (scratch+(i)+B_GR_OFF)
00126         #define B_GB(i) (scratch+(i)+B_GB_OFF)
00127         #define B_B(i)  (scratch+(i)+B_B_OFF)
00128 
00129         // Reuse some of the output scratch area for the noisy inputs
00130         #define G_GR_NOISY B_GR
00131         #define B_B_NOISY  G_B
00132         #define R_R_NOISY  G_R
00133         #define G_GB_NOISY B_GB
00134 
00135         // Prepare the lookup table
00136         unsigned char lut[4096];
00137         makeLUT(src, contrast, blackLevel, gamma, lut);
00138 
00139         // For each block in the input
00140         for (int by = 0; by < rawHeight-8-BLOCK_HEIGHT+1; by += BLOCK_HEIGHT) {
00141             const short * __restrict__ blockPtr = (const short *)input(0,by);
00142             unsigned char * __restrict__ outBlockPtr = out(0, by);
00143             for (int bx = 0; bx < rawWidth-8-BLOCK_WIDTH+1; bx += BLOCK_WIDTH) {                
00144 
00145                 // Stage 1) Demux a block of input into L1
00146                 if (1) {
00147                     register const int16_t * __restrict__ rawPtr = blockPtr;
00148                     register const int16_t * __restrict__ rawPtr2 = blockPtr + rawPixelsPerRow;
00149 
00150                     register const int rawJump = rawPixelsPerRow*2 - VEC_WIDTH*8;
00151 
00152                     register int16_t * __restrict__ g_gr_ptr = denoise ? G_GR_NOISY(0) : G_GR(0);
00153                     register int16_t * __restrict__ r_r_ptr  = denoise ? R_R_NOISY(0)  : R_R(0);
00154                     register int16_t * __restrict__ b_b_ptr  = denoise ? B_B_NOISY(0)  : B_B(0);
00155                     register int16_t * __restrict__ g_gb_ptr = denoise ? G_GB_NOISY(0) : G_GB(0);
00156 
00157                     for (int y = 0; y < VEC_HEIGHT; y++) {
00158                         for (int x = 0; x < VEC_WIDTH/2; x++) {
00159 
00160                             asm volatile ("# Stage 1) Demux\n");
00161                             
00162                             // The below needs to be volatile, but
00163                             // it's not clear why (if it's not, it
00164                             // gets optimized out entirely)
00165                             asm volatile (
00166                                  "vld2.16  {d6-d9}, [%[rawPtr]]! \n\t"
00167                                  "vld2.16  {d10-d13}, [%[rawPtr2]]! \n\t"
00168                                  "vst1.16  {d6-d7}, [%[g_gr_ptr]]! \n\t"
00169                                  "vst1.16  {d8-d9}, [%[r_r_ptr]]! \n\t"
00170                                  "vst1.16  {d10-d11}, [%[b_b_ptr]]! \n\t"
00171                                  "vst1.16  {d12-d13}, [%[g_gb_ptr]]! \n\t" :
00172                                  [rawPtr]"+r"(rawPtr), 
00173                                  [rawPtr2]"+r"(rawPtr2),
00174                                  [g_gr_ptr]"+r"(g_gr_ptr),
00175                                  [r_r_ptr]"+r"(r_r_ptr),
00176                                  [b_b_ptr]"+r"(b_b_ptr),
00177                                  [g_gb_ptr]"+r"(g_gb_ptr) ::
00178                                  "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "memory");
00179                             
00180                         }
00181 
00182                         rawPtr += rawJump;
00183                         rawPtr2 += rawJump;
00184                     }               
00185                 }
00186 
00187                 // Stage 1.5) Denoise sensor input (noisy pixel suppression)
00188 
00189                 // A pixel can't be brighter than its brightest neighbor
00190 
00191                 if (denoise) {
00192                     register int16_t * __restrict__ ptr_in = NULL;
00193                     register int16_t * __restrict__ ptr_out = NULL;
00194                     asm volatile("#Stage 1.5: Denoise\n\t");
00195                     for (int b=0; b<4; b++) {
00196                         if (b==0) ptr_in = G_GR_NOISY(0);
00197                         if (b==1) ptr_in = R_R_NOISY(0);
00198                         if (b==2) ptr_in = B_B_NOISY(0);
00199                         if (b==3) ptr_in = G_GB_NOISY(0);
00200                         if (b==0) ptr_out = G_GR(0);
00201                         if (b==1) ptr_out = R_R(0);
00202                         if (b==2) ptr_out = B_B(0);
00203                         if (b==3) ptr_out = G_GB(0);
00204 
00205                         // write the top block pixels who aren't being denoised
00206                         for (int x = 0; x < (BLOCK_WIDTH+8); x+=8) {
00207                             int16x8_t in = vld1q_s16(ptr_in);
00208                             vst1q_s16(ptr_out, in);
00209                             ptr_in+=8;
00210                             ptr_out+=8;
00211                         }
00212 
00213                         for (int y = 1; y < VEC_HEIGHT - 1; y++) {
00214                             for (int x = 0; x < VEC_WIDTH/2; x++) {
00215                                 int16x8_t here  = vld1q_s16(ptr_in);
00216                                 int16x8_t above = vld1q_s16(ptr_in + VEC_WIDTH*4);
00217                                 int16x8_t under = vld1q_s16(ptr_in - VEC_WIDTH*4);
00218                                 int16x8_t right = vld1q_s16(ptr_in + 1);
00219                                 int16x8_t left  = vld1q_s16(ptr_in - 1);
00220                                 int16x8_t max, min;
00221 
00222                                 // find the max and min of the neighbors
00223                                 max = vmaxq_s16(left, right);
00224                                 max = vmaxq_s16(above, max);
00225                                 max = vmaxq_s16(under, max);
00226 
00227                                 min = vminq_s16(left, right);
00228                                 min = vminq_s16(above, min);
00229                                 min = vminq_s16(under, min);                               
00230 
00231                                 // clamp here to be within this range
00232                                 here  = vminq_s16(max, here);
00233                                 here  = vmaxq_s16(min, here);
00234 
00235                                 vst1q_s16(ptr_out, here);
00236                                 ptr_in += 8;
00237                                 ptr_out += 8;
00238                             }
00239                         }
00240 
00241                         // write the bottom block pixels who aren't being denoised
00242                         for (int x = 0; x < (BLOCK_WIDTH+8); x+=8) {
00243                             int16x8_t in = vld1q_s16(ptr_in);
00244                             vst1q_s16(ptr_out, in);
00245                             ptr_in+=8;
00246                             ptr_out+=8;
00247                         }
00248                     }
00249                 }
00250 
00251                 // Stage 2 and 3) Do horizontal and vertical
00252                 // interpolation of green, as well as picking the
00253                 // output for green
00254                 /*
00255                   gv_r = (gb[UP] + gb[HERE])/2;
00256                   gvd_r = (gb[UP] - gb[HERE]);
00257                   gh_r = (gr[HERE] + gr[RIGHT])/2;
00258                   ghd_r = (gr[HERE] - gr[RIGHT]);                 
00259                   g_r = ghd_r < gvd_r ? gh_r : gv_r;
00260                   
00261                   gv_b = (gr[DOWN] + gr[HERE])/2;
00262                   gvd_b = (gr[DOWN] - gr[HERE]);                  
00263                   gh_b = (gb[LEFT] + gb[HERE])/2;
00264                   ghd_b = (gb[LEFT] - gb[HERE]);                  
00265                   g_b = ghd_b < gvd_b ? gh_b : gv_b;
00266                 */
00267                 if (1) {
00268                 
00269                     int i = VEC_WIDTH*4;
00270 
00271                     register int16_t *g_gb_up_ptr = G_GB(i) - VEC_WIDTH*4;
00272                     register int16_t *g_gb_here_ptr = G_GB(i);
00273                     register int16_t *g_gb_left_ptr = G_GB(i) - 1;
00274                     register int16_t *g_gr_down_ptr = G_GR(i) + VEC_WIDTH*4;
00275                     register int16_t *g_gr_here_ptr = G_GR(i);
00276                     register int16_t *g_gr_right_ptr = G_GR(i) + 1;
00277                     register int16_t *g_r_ptr = G_R(i);
00278                     register int16_t *g_b_ptr = G_B(i);
00279             
00280                     for (int y = 1; y < VEC_HEIGHT-1; y++) {
00281                         for (int x = 0; x < VEC_WIDTH/2; x++) {
00282 
00283                             asm volatile ("#Stage 2) Green interpolation\n");
00284 
00285                             // Load the inputs
00286 
00287                             int16x8_t gb_up = vld1q_s16(g_gb_up_ptr);
00288                             g_gb_up_ptr+=8;
00289                             int16x8_t gb_here = vld1q_s16(g_gb_here_ptr);
00290                             g_gb_here_ptr+=8;
00291                             int16x8_t gb_left = vld1q_s16(g_gb_left_ptr); // unaligned
00292                             g_gb_left_ptr+=8;
00293                             int16x8_t gr_down = vld1q_s16(g_gr_down_ptr);
00294                             g_gr_down_ptr+=8;
00295                             int16x8_t gr_here = vld1q_s16(g_gr_here_ptr);
00296                             g_gr_here_ptr+=8;
00297                             int16x8_t gr_right = vld1q_s16(g_gr_right_ptr); // unaligned
00298                             g_gr_right_ptr+=8;
00299                             
00300                             //I couldn't get this assembly to work, and I don't know which
00301                             // of the three blocks of assembly is wrong
00302                             // This asm should load the inputs
00303                             /*
00304                             asm volatile(
00305                             "vld1.16        {d16-d17}, [%[gb_up]]!\n\t"
00306                             "vld1.16        {d18-d19}, [%[gb_here]]!\n\t"
00307                             "vld1.16        {d20-d21}, [%[gb_left]]!\n\t"
00308                             "vld1.16        {d22-d23}, [%[gr_down]]!\n\t"
00309                             "vld1.16        {d24-d25}, [%[gr_here]]!\n\t"
00310                             "vld1.16        {d26-d27}, [%[gr_right]]!\n\t"
00311                             :
00312                             [gb_up]"+r"(g_gb_up_ptr),
00313                             [gb_here]"+r"(g_gb_here_ptr),
00314                             [gb_left]"+r"(g_gb_left_ptr),
00315                             [gr_down]"+r"(g_gr_down_ptr),
00316                             [gr_here]"+r"(g_gr_here_ptr),
00317                             [gr_right]"+r"(g_gr_right_ptr) :: 
00318                             //"d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27",
00319                             "q8","q9","q10","q11","q12","q13");
00320 
00321                             //q8 - gb_up
00322                             //q9 - gb_here
00323                             //q10 - gb_left
00324                             //q11 - gr_down
00325                             //q12 - gr_here
00326                             //q13 - gr_right
00327                             */
00328 
00329                             // Do the processing
00330                             int16x8_t gv_r  = vhaddq_s16(gb_up, gb_here);
00331                             int16x8_t gvd_r = vabdq_s16(gb_up, gb_here);
00332                             int16x8_t gh_r  = vhaddq_s16(gr_right, gr_here);
00333                             int16x8_t ghd_r = vabdq_s16(gr_here, gr_right);
00334                             int16x8_t g_r = vbslq_s16(vcltq_s16(ghd_r, gvd_r), gh_r, gv_r);
00335 
00336                             int16x8_t gv_b  = vhaddq_s16(gr_down, gr_here);
00337                             int16x8_t gvd_b = vabdq_s16(gr_down, gr_here);
00338                             int16x8_t gh_b  = vhaddq_s16(gb_left, gb_here);
00339                             int16x8_t ghd_b = vabdq_s16(gb_left, gb_here);
00340                             int16x8_t g_b = vbslq_s16(vcltq_s16(ghd_b, gvd_b), gh_b, gv_b);
00341                             
00342                             //this asm should do the above selection/interpolation
00343                             /*
00344                             asm volatile(
00345                             "vabd.s16       q0, q12, q13\n\t" //ghd_r
00346                             "vabd.s16       q1, q8, q9\n\t" //gvd_r
00347                             "vabd.s16       q2, q10, q9\n\t" //ghd_b
00348                             "vabd.s16       q3, q11, q12\n\t" //gvd_b
00349                             "vcgt.s16       q1, q0, q1\n\t" //select ghd_r or gvd_r
00350                             "vcgt.s16       q2, q2, q3\n\t" //select gvd_b or ghd_b
00351                             "vhadd.s16      q8, q8, q9\n\t" //gv_r
00352                             "vhadd.s16      q11, q11, q12\n\t" //gv_b
00353                             "vhadd.s16      q12, q12, q13\n\t" //gh_r
00354                             "vhadd.s16      q9, q9, q10\n\t" //gh_b
00355                             "vbsl           q1, q12, q8\n\t" //g_r
00356                             "vbsl           q2, q9, q11\n\t" //g_b
00357                              ::: "q0","q1","q2","q3","q8","q9","q10","q11","q12","q13");
00358                             */
00359 
00360                             //this should save the output
00361                             /*
00362                             asm volatile(
00363                             "vst1.16        {d2-d3}, [%[g_r]]!\n\t"
00364                             "vst1.16        {d4-d5}, [%[g_b]]!\n\t" :
00365                             [g_r]"+r"(g_r_ptr),[g_b]"+r"(g_b_ptr)
00366                             :: "memory");
00367                             */
00368                             
00369                             // Save the outputs
00370                             vst1q_s16(g_r_ptr, g_r);
00371                             g_r_ptr+=8;
00372                             vst1q_s16(g_b_ptr, g_b);
00373                             g_b_ptr+=8;
00374                         }
00375                     }
00376                 }
00377                 asm volatile ("#End of stage 2 (green interpolation)\n");
00378                 // Stages 4-9
00379 
00380                 if (1) {
00381                     
00382                     /*
00383                       r_gr = (r[LEFT] + r[HERE])/2 + gr[HERE] - (g_r[LEFT] + g_r[HERE])/2;
00384                       b_gr = (b[UP] + b[HERE])/2 + gr[HERE] - (g_b[UP] + g_b[HERE])/2;
00385                       r_gb = (r[HERE] + r[DOWN])/2 + gb[HERE] - (g_r[HERE] + g_r[DOWN])/2;
00386                       b_gb = (b[HERE] + b[RIGHT])/2 + gb[HERE] - (g_b[HERE] + g_b[RIGHT])/2;
00387                       
00388                       rp_b = (r[DOWNLEFT] + r[HERE])/2 + g_b[HERE] - (g_r[DOWNLEFT] + g_r[HERE])/2;
00389                       rn_b = (r[LEFT] + r[DOWN])/2 + g_b[HERE] - (g_r[LEFT] + g_r[DOWN])/2;
00390                       rpd_b = (r[DOWNLEFT] - r[HERE]);
00391                       rnd_b = (r[LEFT] - r[DOWN]);      
00392                       r_b = rpd_b < rnd_b ? rp_b : rn_b;                      
00393         
00394                       bp_r = (b[UPRIGHT] + b[HERE])/2 + g_r[HERE] - (g_b[UPRIGHT] + g_b[HERE])/2;
00395                       bn_r = (b[RIGHT] + b[UP])/2 + g_r[HERE] - (g_b[RIGHT] + g_b[UP])/2;       
00396                       bpd_r = (b[UPRIGHT] - b[HERE]);
00397                       bnd_r = (b[RIGHT] - b[UP]);       
00398                       b_r = bpd_r < bnd_r ? bp_r : bn_r;
00399                     */
00400 
00401                     int i = 2*VEC_WIDTH*4;
00402 
00403                     for (int y = 2; y < VEC_HEIGHT-2; y++) {
00404                         for (int x = 0; x < VEC_WIDTH; x++) {
00405 
00406                             asm volatile ("#Stage 4) r/b interpolation\n");
00407 
00408                             // Load the inputs
00409                             int16x4_t r_here       = vld1_s16(R_R(i));
00410                             int16x4_t r_left       = vld1_s16(R_R(i) - 1);
00411                             int16x4_t r_down       = vld1_s16(R_R(i) + VEC_WIDTH*4);
00412 
00413                             int16x4_t g_r_left     = vld1_s16(G_R(i) - 1);
00414                             int16x4_t g_r_here     = vld1_s16(G_R(i));
00415                             int16x4_t g_r_down     = vld1_s16(G_R(i) + VEC_WIDTH*4);
00416 
00417                             int16x4_t b_up         = vld1_s16(B_B(i) - VEC_WIDTH*4);
00418                             int16x4_t b_here       = vld1_s16(B_B(i));
00419                             int16x4_t b_right      = vld1_s16(B_B(i) + 1);
00420 
00421                             int16x4_t g_b_up       = vld1_s16(G_B(i) - VEC_WIDTH*4);
00422                             int16x4_t g_b_here     = vld1_s16(G_B(i));
00423                             int16x4_t g_b_right    = vld1_s16(G_B(i) + 1);
00424 
00425                             // Do the processing
00426                             int16x4_t gr_here      = vld1_s16(G_GR(i));
00427                             int16x4_t gb_here      = vld1_s16(G_GB(i));
00428 
00429                             { // red at green
00430                                 int16x4_t r_gr  = vadd_s16(vhadd_s16(r_left, r_here),
00431                                                             vsub_s16(gr_here,
00432                                                                       vhadd_s16(g_r_left, g_r_here)));
00433                                 int16x4_t r_gb  = vadd_s16(vhadd_s16(r_here, r_down),
00434                                                             vsub_s16(gb_here, 
00435                                                                       vhadd_s16(g_r_down, g_r_here)));
00436                                 vst1_s16(R_GR(i), r_gr);
00437                                 vst1_s16(R_GB(i), r_gb);
00438                             }
00439                             
00440                             { // red at blue
00441                                 int16x4_t r_downleft   = vld1_s16(R_R(i) + VEC_WIDTH*4 - 1);
00442                                 int16x4_t g_r_downleft = vld1_s16(G_R(i) + VEC_WIDTH*4 - 1);
00443                                 
00444                                 int16x4_t rp_b  = vadd_s16(vhadd_s16(r_downleft, r_here),
00445                                                             vsub_s16(g_b_here,
00446                                                                      vhadd_s16(g_r_downleft, g_r_here)));
00447                                 int16x4_t rn_b  = vadd_s16(vhadd_s16(r_left, r_down),
00448                                                             vsub_s16(g_b_here,
00449                                                                      vhadd_s16(g_r_left, g_r_down)));
00450                                 int16x4_t rpd_b = vabd_s16(r_downleft, r_here);
00451                                 int16x4_t rnd_b = vabd_s16(r_left, r_down);
00452                                 int16x4_t r_b   = vbsl_s16(vclt_s16(rpd_b, rnd_b), rp_b, rn_b);
00453                                 vst1_s16(R_B(i), r_b);
00454                             }
00455                             
00456                             { // blue at green
00457                                 int16x4_t b_gr  = vadd_s16(vhadd_s16(b_up, b_here),
00458                                                             vsub_s16(gr_here,
00459                                                                      vhadd_s16(g_b_up, g_b_here)));
00460                                 int16x4_t b_gb  = vadd_s16(vhadd_s16(b_here, b_right),
00461                                                             vsub_s16(gb_here,
00462                                                                      vhadd_s16(g_b_right, g_b_here)));
00463                                 vst1_s16(B_GR(i), b_gr);
00464                                 vst1_s16(B_GB(i), b_gb);
00465                             }
00466                             
00467                             { // blue at red
00468                                 int16x4_t b_upright    = vld1_s16(B_B(i) - VEC_WIDTH*4 + 1);
00469                                 int16x4_t g_b_upright  = vld1_s16(G_B(i) - VEC_WIDTH*4 + 1);
00470                                 
00471                                 int16x4_t bp_r  = vadd_s16(vhadd_s16(b_upright, b_here),
00472                                                             vsub_s16(g_r_here, 
00473                                                                       vhadd_s16(g_b_upright, g_b_here)));
00474                                 int16x4_t bn_r  = vadd_s16(vhadd_s16(b_right, b_up),
00475                                                             vsub_s16(g_r_here,
00476                                                                       vhadd_s16(g_b_right, g_b_up)));
00477                                 int16x4_t bpd_r = vabd_s16(b_upright, b_here);
00478                                 int16x4_t bnd_r = vabd_s16(b_right, b_up);
00479                                 int16x4_t b_r   = vbsl_s16(vclt_s16(bpd_r, bnd_r), bp_r, bn_r);
00480                                 vst1_s16(B_R(i), b_r);
00481                             }
00482                             
00483                             // Advance the index
00484                             i += 4;
00485                         }
00486                     }
00487                     asm volatile ("#End of stage 4 - what_ever\n\t");
00488                 }
00489 
00490                 // Stage 10)
00491                 if (1) {
00492                     // Color-correct and save the results into a 16-bit buffer for gamma correction
00493 
00494                     asm volatile ("#Stage 10) Color Correction\n");
00495 
00496                     uint16_t * __restrict__ out16Ptr = out16;
00497 
00498                     int i = 2*VEC_WIDTH*4;
00499 
00500                     const uint16x4_t bound = vdup_n_u16(1023);
00501 
00502                     for (int y = 2; y < VEC_HEIGHT-2; y++) {
00503 
00504                         // skip the first vec in each row
00505                         
00506                         int16x4x2_t r0 = vzip_s16(vld1_s16(R_GR(i)), vld1_s16(R_R(i)));
00507                         int16x4x2_t g0 = vzip_s16(vld1_s16(G_GR(i)), vld1_s16(G_R(i)));
00508                         int16x4x2_t b0 = vzip_s16(vld1_s16(B_GR(i)), vld1_s16(B_R(i)));
00509                         i += 4;
00510 
00511                         for (int x = 1; x < VEC_WIDTH; x++) {
00512                             
00513                             int16x4x2_t r1 = vzip_s16(vld1_s16(R_GR(i)), vld1_s16(R_R(i)));
00514                             int16x4x2_t g1 = vzip_s16(vld1_s16(G_GR(i)), vld1_s16(G_R(i)));
00515                             int16x4x2_t b1 = vzip_s16(vld1_s16(B_GR(i)), vld1_s16(B_R(i)));
00516                             
00517                             // do the color matrix
00518                             int32x4_t rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));                       
00519                             rout = vmlal_lane_s16(rout, r0.val[1], colorMatrix[0], 0);
00520                             rout = vmlal_lane_s16(rout, g0.val[1], colorMatrix[0], 1);
00521                             rout = vmlal_lane_s16(rout, b0.val[1], colorMatrix[0], 2);
00522                             
00523                             int32x4_t gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));                       
00524                             gout = vmlal_lane_s16(gout, r0.val[1], colorMatrix[1], 0);
00525                             gout = vmlal_lane_s16(gout, g0.val[1], colorMatrix[1], 1);
00526                             gout = vmlal_lane_s16(gout, b0.val[1], colorMatrix[1], 2);
00527                             
00528                             int32x4_t bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00529                             bout = vmlal_lane_s16(bout, r0.val[1], colorMatrix[2], 0);
00530                             bout = vmlal_lane_s16(bout, g0.val[1], colorMatrix[2], 1);
00531                             bout = vmlal_lane_s16(bout, b0.val[1], colorMatrix[2], 2);
00532                             
00533                             uint16x4x3_t col16;
00534                             col16.val[0] = vqrshrun_n_s32(rout, 8);
00535                             col16.val[1] = vqrshrun_n_s32(gout, 8);
00536                             col16.val[2] = vqrshrun_n_s32(bout, 8);                     
00537                             col16.val[0] = vmin_u16(col16.val[0], bound);
00538                             col16.val[1] = vmin_u16(col16.val[1], bound);
00539                             col16.val[2] = vmin_u16(col16.val[2], bound);
00540                             vst3_u16(out16Ptr, col16);
00541                             out16Ptr += 12;
00542                             
00543                             rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));                 
00544                             rout = vmlal_lane_s16(rout, r1.val[0], colorMatrix[0], 0);
00545                             rout = vmlal_lane_s16(rout, g1.val[0], colorMatrix[0], 1);
00546                             rout = vmlal_lane_s16(rout, b1.val[0], colorMatrix[0], 2);
00547                             
00548                             gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));                 
00549                             gout = vmlal_lane_s16(gout, r1.val[0], colorMatrix[1], 0);
00550                             gout = vmlal_lane_s16(gout, g1.val[0], colorMatrix[1], 1);
00551                             gout = vmlal_lane_s16(gout, b1.val[0], colorMatrix[1], 2);
00552                             
00553                             bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00554                             bout = vmlal_lane_s16(bout, r1.val[0], colorMatrix[2], 0);
00555                             bout = vmlal_lane_s16(bout, g1.val[0], colorMatrix[2], 1);
00556                             bout = vmlal_lane_s16(bout, b1.val[0], colorMatrix[2], 2);
00557                             
00558                             col16.val[0] = vqrshrun_n_s32(rout, 8);
00559                             col16.val[1] = vqrshrun_n_s32(gout, 8);
00560                             col16.val[2] = vqrshrun_n_s32(bout, 8);                     
00561                             col16.val[0] = vmin_u16(col16.val[0], bound);
00562                             col16.val[1] = vmin_u16(col16.val[1], bound);
00563                             col16.val[2] = vmin_u16(col16.val[2], bound);
00564                             vst3_u16(out16Ptr, col16);
00565                             out16Ptr += 12;
00566                             
00567                             r0 = r1;
00568                             g0 = g1;
00569                             b0 = b1;
00570 
00571                             i += 4;
00572                         }
00573 
00574                         // jump back
00575                         i -= VEC_WIDTH*4;
00576 
00577                         r0 = vzip_s16(vld1_s16(R_B(i)), vld1_s16(R_GB(i)));
00578                         g0 = vzip_s16(vld1_s16(G_B(i)), vld1_s16(G_GB(i)));
00579                         b0 = vzip_s16(vld1_s16(B_B(i)), vld1_s16(B_GB(i)));                     
00580                         i += 4;
00581 
00582                         for (int x = 1; x < VEC_WIDTH; x++) {
00583                             int16x4x2_t r1 = vzip_s16(vld1_s16(R_B(i)), vld1_s16(R_GB(i)));
00584                             int16x4x2_t g1 = vzip_s16(vld1_s16(G_B(i)), vld1_s16(G_GB(i)));
00585                             int16x4x2_t b1 = vzip_s16(vld1_s16(B_B(i)), vld1_s16(B_GB(i)));
00586                             
00587                             // do the color matrix
00588                             int32x4_t rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));                       
00589                             rout = vmlal_lane_s16(rout, r0.val[1], colorMatrix[0], 0);
00590                             rout = vmlal_lane_s16(rout, g0.val[1], colorMatrix[0], 1);
00591                             rout = vmlal_lane_s16(rout, b0.val[1], colorMatrix[0], 2);
00592                             
00593                             int32x4_t gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));                       
00594                             gout = vmlal_lane_s16(gout, r0.val[1], colorMatrix[1], 0);
00595                             gout = vmlal_lane_s16(gout, g0.val[1], colorMatrix[1], 1);
00596                             gout = vmlal_lane_s16(gout, b0.val[1], colorMatrix[1], 2);
00597                             
00598                             int32x4_t bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00599                             bout = vmlal_lane_s16(bout, r0.val[1], colorMatrix[2], 0);
00600                             bout = vmlal_lane_s16(bout, g0.val[1], colorMatrix[2], 1);
00601                             bout = vmlal_lane_s16(bout, b0.val[1], colorMatrix[2], 2);
00602                             
00603                             uint16x4x3_t col16;
00604                             col16.val[0] = vqrshrun_n_s32(rout, 8);
00605                             col16.val[1] = vqrshrun_n_s32(gout, 8);
00606                             col16.val[2] = vqrshrun_n_s32(bout, 8);                     
00607                             col16.val[0] = vmin_u16(col16.val[0], bound);
00608                             col16.val[1] = vmin_u16(col16.val[1], bound);
00609                             col16.val[2] = vmin_u16(col16.val[2], bound);
00610                             vst3_u16(out16Ptr, col16);
00611                             out16Ptr += 12;
00612                             
00613                             rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));                 
00614                             rout = vmlal_lane_s16(rout, r1.val[0], colorMatrix[0], 0);
00615                             rout = vmlal_lane_s16(rout, g1.val[0], colorMatrix[0], 1);
00616                             rout = vmlal_lane_s16(rout, b1.val[0], colorMatrix[0], 2);
00617                             
00618                             gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));                 
00619                             gout = vmlal_lane_s16(gout, r1.val[0], colorMatrix[1], 0);
00620                             gout = vmlal_lane_s16(gout, g1.val[0], colorMatrix[1], 1);
00621                             gout = vmlal_lane_s16(gout, b1.val[0], colorMatrix[1], 2);
00622                             
00623                             bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00624                             bout = vmlal_lane_s16(bout, r1.val[0], colorMatrix[2], 0);
00625                             bout = vmlal_lane_s16(bout, g1.val[0], colorMatrix[2], 1);
00626                             bout = vmlal_lane_s16(bout, b1.val[0], colorMatrix[2], 2);
00627                             
00628                             col16.val[0] = vqrshrun_n_s32(rout, 8);
00629                             col16.val[1] = vqrshrun_n_s32(gout, 8);
00630                             col16.val[2] = vqrshrun_n_s32(bout, 8);                     
00631                             col16.val[0] = vmin_u16(col16.val[0], bound);
00632                             col16.val[1] = vmin_u16(col16.val[1], bound);
00633                             col16.val[2] = vmin_u16(col16.val[2], bound);
00634                             vst3_u16(out16Ptr, col16);
00635                             out16Ptr += 12;
00636                             
00637                             r0 = r1;
00638                             g0 = g1;
00639                             b0 = b1;
00640 
00641                             i += 4;
00642                         }
00643                     }   
00644                     asm volatile("#End of stage 10) - color correction\n\t");
00645                 }
00646                 
00647 
00648                 if (1) {
00649 
00650                     asm volatile("#Gamma Correction\n");                   
00651                     // Gamma correction (on the CPU, not the NEON)
00652                     const uint16_t * __restrict__ out16Ptr = out16;
00653                     
00654                     for (int y = 0; y < BLOCK_HEIGHT; y++) {                    
00655                         unsigned int * __restrict__ outPtr32 = (unsigned int *)(outBlockPtr + y * outWidth * 3);
00656                         for (int x = 0; x < (BLOCK_WIDTH*3)/4; x++) {
00657                             unsigned val = ((lut[out16Ptr[0]] << 0) |
00658                                             (lut[out16Ptr[1]] << 8) | 
00659                                             (lut[out16Ptr[2]] << 16) |
00660                                             (lut[out16Ptr[3]] << 24));
00661                             *outPtr32++ = val;
00662                             out16Ptr += 4;
00663                             // *outPtr++ = lut[*out16Ptr++];
00664                         }
00665                     }           
00666                     asm volatile("#end of Gamma Correction\n");                   
00667                     
00668                     /*
00669                     const uint16_t * __restrict__ out16Ptr = out16;                 
00670                     for (int y = 0; y < BLOCK_HEIGHT; y++) {                    
00671                         unsigned char * __restrict__ outPtr = (outBlockPtr + y * outWidth * 3);
00672                         for (int x = 0; x < (BLOCK_WIDTH*3); x++) {
00673                             *outPtr++ = lut[*out16Ptr++];
00674                         }
00675                     }
00676                     */
00677                     
00678                 }
00679                 
00680 
00681                 blockPtr += BLOCK_WIDTH;
00682                 outBlockPtr += BLOCK_WIDTH*3;
00683             }
00684         }       
00685 
00686         //std::cout << "Done demosaicking. time = " << ((Time::now() - startTime)/1000) << std::endl;
00687         return out;
00688     }
00689 
00690     Image makeThumbnailRAW_ARM(Frame src, float contrast, int blackLevel, float gamma) {
00691         // Builds a 640x480 RGB24 thumbnail from the center of a raw frame. The source is assumed to be 2592x1968, 16 bits per sample, Bayer pattern GRBG.
00692         // Each output pixel averages a 4x4 block of Bayer samples (scale = 4), so a centered 2560x1920 region of the source is consumed.
00693         // The averaged color is multiplied by a 3x4 color matrix (the shot's own matrix when it has 12 entries, otherwise the platform matrix for the shot's white balance)
00694         // and then mapped through the lookup table that makeLUT fills in from contrast, blackLevel and gamma. The finished thumbnail Image is returned.
00695         // NOTE(review): neither the size nor the Bayer pattern of src.image() is checked in this function - confirm that callers guarantee both.
00696         // Implementation:
00697         //   Uses ARM NEON SIMD vector instructions and inline assembly.
00698         //   Reads in a 16x4 block of pixels at a time, in 16-bit GRBG Bayer format, and outputs a 4x1 block of RGB24 pixels.
00699         // Important note: Due to some apparent bugs in GCC's inline assembly register mapping between C variables and NEON registers,
00700         //   namely that trying to reference an int16x4 variable creates a reference to an s register instead of a d register, all the
00701         //   int16x4 variables are forced into specific NEON registers, and then referred to using that register, not by name.
00702         //   This bug seems to be in gcc 4.2.1, should be fixed by 4.4 based on some gcc bug reports.
00703
00704         Image thumb(640, 480, RGB24);
00705         const unsigned int w = 2592, tw = 640; // Assumed source width, thumbnail width
00706         const unsigned int h = 1968, th = 480; // Assumed source height, thumbnail height
00707         const unsigned int scale = 4; // Source samples per thumbnail pixel along each axis
00708         const unsigned int cw = tw*scale; // Width of the source region that is downsampled (2560)
00709         const unsigned int ch = th*scale; // Height of the source region that is downsampled (1920)
00710         const unsigned int startX = (w-cw)/2; // = 16: centers the region horizontally; even, so the Bayer phase is unchanged
00711         const unsigned int startY = (h-ch)/2;        // = 24: centers the region vertically; even, so the Bayer phase is unchanged
00712         const unsigned int bytesPerRow = src.image().bytesPerRow();
00713
00714         // Make the response curve. The table has 4096 entries; the assembly below only indexes it with values clamped to 0..1023.
00715         unsigned char lut[4096];
00716         makeLUT(src, contrast, blackLevel, gamma, lut);
00717
00718         unsigned char *row = src.image()(startX, startY); // Start of the cropped region; advanced by four source rows per thumbnail row
00719
00720         Time startTime = Time::now(); // Only referenced by the commented-out timing printout at the end of the function
00721         float colorMatrix_f[12]; // 3 rows x 4 columns; loaded below as three q registers
00722
00723         // Check if there's a custom color matrix
00724         if (src.shot().colorMatrix().size() == 12) {
00725             for (int i = 0; i < 12; i++) {
00726                 colorMatrix_f[i] = src.shot().colorMatrix()[i];
00727             }
00728             printf("Making thumbnail with custom WB\n"); // NOTE(review): printed on every call - looks like leftover debug output
00729         } else {
00730             // Otherwise use the platform version
00731             src.platform().rawToRGBColorMatrix(src.shot().whiteBalance, colorMatrix_f);
00732             printf("Making thumbnail with platform WB\n"); // NOTE(review): printed on every call - looks like leftover debug output
00733         }
00734
00735         register int16x4_t colorMatrix0 asm ("d0"); // ASM assignments are essential - they're implicitly trusted by the inline code.
00736         register int16x4_t colorMatrix1 asm ("d1");
00737         register int16x4_t colorMatrix2 asm ("d2");
00738         register int16x4_t wCoord asm ("d20"); // Workaround for annoyances with scalar addition: holds the homogeneous coordinate (4) in every lane.
00739         register int16x4_t maxValue asm ("d21"); // Upper clamp applied to pixel values (set to 1023 below)
00740         register int16x4_t minValue asm ("d22"); // Lower clamp applied to pixel values (set to 0 below)
00741
00742         asm volatile(
00743                     // Load matrix into colorMatrix0-2, set to be d0-d2
00744                     "vldm %[colorMatrix_f], {q2,q3,q4}  \n\t" // 12 floats -> q2-q4 (d4-d9), one matrix row per q register
00745                     "vcvt.s32.f32 q2, q2, #8  \n\t" // Float->fixed-point conversion with 8 fractional bits
00746                     "vcvt.s32.f32 q3, q3, #8  \n\t"
00747                     "vcvt.s32.f32 q4, q4, #8  \n\t"
00748                     "vmovn.i32 d0, q2  \n\t" // Narrowing to 16-bit
00749                     "vmovn.i32 d1, q3  \n\t"
00750                     "vmovn.i32 d2, q4  \n\t"
00751                     // Load homogenous coordinate, pixel value limits
00752                     "vmov.i16  d20, #0x4   \n\t"  // Homogenous coordinate: 4, because the channel sums fed to the matrix are 4x the block average.
00753                     "vmov.i16  d21, #0x00FF  \n\t"  // Maximum pixel value: 1023 (0x03FF), low byte
00754                     "vorr.i16  d21, #0x0300  \n\t"  // Maximum pixel value part 2, high bits
00755                     "vmov.i16  d22, #0x0     \n\t"  // Minimum pixel value: 0
00756                     : [colorMatrix0] "=w" (colorMatrix0), // The outputs are never named in the template; they are listed so the compiler treats the fixed registers as written here.
00757                       [colorMatrix1] "=w" (colorMatrix1),
00758                       [colorMatrix2] "=w" (colorMatrix2),
00759                       [wCoord] "=w" (wCoord),
00760                       [maxValue] "=w" (maxValue),
00761                       [minValue] "=w" (minValue)
00762                     :  [colorMatrix_f] "r" (colorMatrix_f)
00763                     : "memory",
00764                       "d3", "d4", "d5", "d6", "d7", "d8", "d9");
00765                 // One thumbnail row per iteration; each consumes four consecutive source rows.
00766         for (unsigned int ty = 0; ty <480; ty++, row+=4*bytesPerRow) {
00767             register unsigned short *px0 = (unsigned short *)row; // Read pointers for the four source rows of this thumbnail row
00768             register unsigned short *px1 = (unsigned short *)(row+1*bytesPerRow);
00769             register unsigned short *px2 = (unsigned short *)(row+2*bytesPerRow);
00770             register unsigned short *px3 = (unsigned short *)(row+3*bytesPerRow);
00771
00772             register unsigned char *dst = thumb(0,ty); // Write pointer into thumbnail row ty
00773             for (register unsigned int tx =0; tx < 640; tx+=scale) { // Each pass emits 4 thumbnail pixels; the step is `scale`, which equals 4 here
00774                 // Assembly block for fast downsample/demosaic, color correction, and gamma curve lookup
00775                 asm volatile(
00776                     // *px0: GRGR GRGR GRGR GRGR
00777                     // *px1: BGBG BGBG BGBG BGBG
00778                     // *px2: GRGR GRGR GRGR GRGR
00779                     // *px3: BGBG BGBG BGBG BGBG
00781                     "vld2.16 {d4-d7}, [%[px0]]!  \n\t" // De-interleaving loads: 16 samples per row split into even and odd columns; each pointer is post-incremented
00782                     "vld2.16 {d8-d11}, [%[px1]]! \n\t"
00783                     "vld2.16 {d12-d15}, [%[px2]]! \n\t"
00784                     "vld2.16 {d16-d19}, [%[px3]]! \n\t"
00785                     //  d4    d5    d6    d7
00786                     // GG|GG GG|GG RR|RR RR|RR
00787                     //  d8    d9    d10   d11
00788                     // BB|BB BB|BB GG|GG GG|GG
00789                     //  d12   d13   d14   d15
00790                     // GG|GG GG|GG RR|RR RR|RR
00791                     //  d16   d17   d18   d19
00792                     // BB|BB BB|BB GG|GG GG|GG
00793                     // Pairwise adds: every lane becomes the sum of two horizontally adjacent samples of the same color.
00795                     "vpadd.u16 d4, d4, d5  \n\t"   // G1
00796                     "vpadd.u16 d5, d6, d7  \n\t"   // R1
00797                     "vpadd.u16 d6, d8, d9  \n\t"   // B1
00798                     "vpadd.u16 d7, d10, d11 \n\t"  // G2
00799                     "vpadd.u16 d8, d12, d13 \n\t"  // G3
00800                     "vpadd.u16 d9, d14, d15 \n\t"  // R2
00801                     "vpadd.u16 d10, d16, d17 \n\t" // B2
00802                     "vpadd.u16 d11, d18, d19 \n\t" // G4
00803                     //    d4       d5       d6       d7
00804                     // G|G|G|G  R|R|R|R  B|B|B|B  G|G|G|G
00805                     //    d8       d9       d10      d11
00806                     // G|G|G|G  R|R|R|R  B|B|B|B  G|G|G|G
00807                     // Combine rows: G = ((G1+G4) + (G2+G3)) / 2 via the halving add, R = R1+R2, B = B1+B2, so every channel is 4x its block average.
00809                     "vadd.u16 d7, d8   \n\t"
00810                     "vadd.u16 d4, d11   \n\t"
00811                     "vhadd.u16 d4, d7  \n\t"
00813                     "vadd.u16 d5, d9  \n\t"
00815                     "vadd.u16 d6, d10 \n\t"
00816                     //    d4       d5       d6  
00817                     // G|G|G|G  R|R|R|R  B|B|B|B
00818                     //
00819                     // Assuming sRGB affine matrix stored in fixed precision (lsb = 1/256)
00820                     // Trusting GCC to properly assign colorMatrix0-2 to d0-d2.  Direct reference seems to be broken on g++ 4.2.1 at least.
00821                     // r   colorMatrix0[0] [1] [2] [3]   r_in
00822                     // g = colorMatrix1[0] [1] [2] [3] * g_in
00823                     // b   colorMatrix2[0] [1] [2] [3]   b_in
00824                     // Each q register below accumulates one output channel for the 4 pixels; d20 (all lanes 4) supplies the homogeneous term.
00826
00827                     "vmull.s16 q4, d5, d0[0] \n\t"
00828                     "vmlal.s16 q4, d4, d0[1] \n\t"
00829                     "vmlal.s16 q4, d6, d0[2] \n\t"
00830                     "vmlal.s16 q4, d20, d0[3] \n\t" 
00831
00832                     "vmull.s16 q5, d5, d1[0] \n\t"
00833                     "vmlal.s16 q5, d4, d1[1] \n\t"
00834                     "vmlal.s16 q5, d6, d1[2] \n\t"
00835                     "vmlal.s16 q5, d20, d1[3] \n\t"
00836
00837                     "vmull.s16 q6, d5, d2[0] \n\t"
00838                     "vmlal.s16 q6, d4, d2[1] \n\t"
00839                     "vmlal.s16 q6, d6, d2[2] \n\t"
00840                     "vmlal.s16 q6, d20, d2[3] \n\t"
00841
00842                     //  d08  d09  d10  d11  d12  d13
00843                     //  R|R  R|R  G|G  G|G  B|B  B|B
00844                     // Rounding narrow by 10 bits: 8 fractional bits of the matrix plus 2 bits for the 4x input scale. Then clamp to [0, 1023].
00846                     "vrshrn.s32 d3, q4, #10  \n\t"
00847                     "vrshrn.s32 d4, q5, #10  \n\t"
00848                     "vrshrn.s32 d5, q6, #10  \n\t"
00850                     "vmin.s16 d3, d3, d21    \n\t"
00851                     "vmin.s16 d4, d4, d21    \n\t"
00852                     "vmin.s16 d5, d5, d21    \n\t"
00853                     "vmax.s16 d3, d3, d22    \n\t"
00854                     "vmax.s16 d4, d4, d22    \n\t"
00855                     "vmax.s16 d5, d5, d22    \n\t"
00856
00857                     //    d3       d4       d5
00858                     // R|R|R|R  G|G|G|G  B|B|B|B
00859                     // Gamma lookup: move each channel to ARM registers, use every 16-bit value as a byte index into gammaTable, and pack the results as RGB24.
00861                     "vmov r0,r1, d3                        \n\t"
00862                     //    r0       r1
00863                     // R16|R16  R16|R16
00864                     "uxth r2, r0                           \n\t" // Extract first red pixel into r2
00865                     "ldrb r4, [%[gammaTable], r2]          \n\t" // Table lookup, byte result
00866
00867                     "uxth r2, r0, ROR #16                  \n\t"
00868                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00869                     "orr  r4, r4, r3, LSL #24              \n\t"
00870
00871                     "uxth r2, r1                           \n\t"
00872                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00873                     "mov  r5, r3, LSL #16                  \n\t"
00874
00875                     "uxth r2, r1, ROR #16                  \n\t"
00876                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00877                     "mov  r6, r3, LSL #8                   \n\t"
00878
00879                     //   r4   r5   r6  
00880                     //  R__R __R_ _R__  -> increasing mem address (and increasing left shift)
00881
00882                     "vmov r0,r1, d4                        \n\t"
00883                     //    r0       r1
00884                     // G16|G16  G16|G16
00885                     "uxth r2, r0                           \n\t"
00886                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00887                     "orr  r4, r4, r3, LSL #8               \n\t"
00888
00889                     "uxth r2, r0, ROR #16                  \n\t"
00890                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00891                     "orr  r5, r5, r3                       \n\t"
00892
00893                     "uxth r2, r1                           \n\t"
00894                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00895                     "orr  r5, r5, r3, LSL #24              \n\t"
00896
00897                     "uxth r2, r1, ROR #16                  \n\t"
00898                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00899                     "orr  r6, r6, r3, LSL #16              \n\t"
00900
00901                     //   r4   r5   r6  
00902                     //  RG_R G_RG _RG_  -> increasing mem address (and increasing left shift)
00903
00904                     "vmov r0,r1, d5                        \n\t"
00905                     //    r0       r1
00906                     // B16|B16  B16|B16
00907                     "uxth r2, r0                           \n\t"
00908                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00909                     "orr  r4, r4, r3, LSL #16              \n\t"
00910
00911                     "uxth r2, r0, ROR #16                  \n\t"
00912                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00913                     "orr  r5, r5, r3, LSL #8               \n\t"
00914
00915                     "uxth r2, r1                           \n\t"
00916                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00917                     "orr  r6, r6, r3                       \n\t"
00918
00919                     "uxth r2, r1, ROR #16                  \n\t"
00920                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00921                     "orr  r6, r6, r3, LSL #24              \n\t"
00922
00923                     //   r4   r5   r6  
00924                     //  RGBR GBRG BRGB 
00925
00926                     "stm %[dst]!, {r4,r5,r6}                   \n\t" // Multi-store: writes 12 bytes (4 RGB24 pixels) and advances dst
00927                     : [px0] "+&r" (px0),
00928                       [px1] "+&r" (px1),
00929                       [px2] "+&r" (px2),
00930                       [px3] "+&r" (px3),
00931                       [dst] "+&r" (dst)
00932                     : [gammaTable] "r" (lut),
00933                       [colorMatrix0] "w" (colorMatrix0), // Implicitly referenced only (d0)
00934                       [colorMatrix1] "w" (colorMatrix1), // Implicitly referenced only (d1)
00935                       [colorMatrix2] "w" (colorMatrix2), // Implicitly referenced only (d2)
00936                       [wCoord] "w" (wCoord),             // Implicitly referenced only (d20)
00937                       [maxValue] "w" (maxValue),         // Implicitly referenced only (d21)
00938                       [minValue] "w" (minValue)          // Implicitly referenced only (d22)
00939                     : "memory", 
00940                       "r0", "r1", "r2", "r3", "r4", "r5", "r6",
00941                       "d3", "d4", "d5", "d6",
00942                       "d7", "d8", "d9", "d10", 
00943                       "d11", "d12", "d13", "d14",
00944                       "d15", "d16", "d17", "d18", "d19"
00945                     );
00946
00947             }            
00948         }
00949         
00950         //std::cout << "Done creating fast thumbnail. time = " << ((Time::now()-startTime)/1000) << std::endl;
00951
00952         return thumb;
00953     }
00954 }
00955 
00956 
00957 #endif

Generated on Fri Sep 24 2010 15:53:00 for FCam by  doxygen 1.7.1