PageRenderTime 68ms CodeModel.GetById 19ms app.highlight 42ms RepoModel.GetById 1ms app.codeStats 0ms

/ocr/ocrservice/jni/opticalflow/image.h

http://eyes-free.googlecode.com/
C++ Header | 779 lines | 474 code | 132 blank | 173 comment | 51 complexity | 30b321ae7bed2aab764ce116348a8428 MD5 | raw file
  1// Copyright 2009 Google Inc. All Rights Reserved.
  2// Author: andrewharp@google.com (Andrew Harp)
  3
  4#ifndef JAVA_COM_GOOGLE_ANDROID_APPS_UNVEIL_JNI_OPTICALFLOW_IMAGE_H_
  5#define JAVA_COM_GOOGLE_ANDROID_APPS_UNVEIL_JNI_OPTICALFLOW_IMAGE_H_
  6
  7#include "optical_flow_utils.h"
  8
  9// TODO(andrewharp): Make this a cast to uint32 if/when we go unsigned for
 10// operations.
 11#define ZERO 0
 12
 13#ifdef SANITY_CHECKS
 14  #define CHECK_PIXEL(IMAGE, X, Y) {\
 15    CHECK((IMAGE)->validPixel((X), (Y)), \
 16          "CHECK_PIXEL(%d,%d) in %dx%d image.", \
 17          static_cast<int32>(X), static_cast<int32>(Y), \
 18          (IMAGE)->getWidth(), (IMAGE)->getHeight());\
 19  }
 20
 21  #define CHECK_PIXEL_INTERP(IMAGE, X, Y) {\
 22    CHECK((IMAGE)->validInterpPixel((X), (Y)), \
 23          "CHECK_PIXEL_INTERP(%.2f, %.2f) in %dx%d image.", \
 24          static_cast<float32>(X), static_cast<float32>(Y), \
 25          (IMAGE)->getWidth(), (IMAGE)->getHeight());\
 26  }
 27#else
 28  #define CHECK_PIXEL(image, x, y) {}
 29  #define CHECK_PIXEL_INTERP(IMAGE, X, Y) {}
 30#endif
 31
 32namespace flow {
 33
 34// TODO(andrewharp): Make explicit which operations support negative numbers or
 35// struct/class types in image data (possibly create fast multi-dim array class
 36// for data where pixel arithmetic does not make sense).
 37
 38// Image class optimized for working on numeric arrays as grayscale image data.
 39// Supports other data types as a 2D array class, so long as no pixel math
 40// operations are called (convolution, downsampling, etc).
 41template <typename T>
 42class Image {
 43 public:
 44  Image(const int32 width, const int32 height) :
 45      width_(width),
 46      height_(height),
 47      width_less_one_(width_ - 1),
 48      height_less_one_(height_ - 1),
 49      num_pixels_(width_ * height_) {
 50    allocate();
 51  }
 52
 53  explicit Image(const Size& size) :
 54      width_(size.width),
 55      height_(size.height),
 56      width_less_one_(width_ - 1),
 57      height_less_one_(height_ - 1),
 58      num_pixels_(width_ * height_) {
 59    allocate();
 60  }
 61
 62  // Constructor that creates an image from preallocated data.
 63  // Note: The image takes ownership of the data.
 64  Image(const int32 width, const int32 height, T* const image) :
 65      width_(width),
 66      height_(height),
 67      width_less_one_(width_ - 1),
 68      height_less_one_(height_ - 1),
 69      num_pixels_(width_ * height_) {
 70    image_data_ = image;
 71    if (image_data_ == NULL) {
 72      LOGE("Can't create image with NULL data!");
 73    }
 74  }
 75
 76  ~Image() {
 77    free(image_data_);
 78  }
 79
 80  inline int32 getWidth() const { return width_; }
 81  inline int32 getHeight() const { return height_; }
 82
 83  // Bilinearly sample a value between pixels.
 84  // Values outside of the image are sampled from the nearest edge of the image.
 85  inline float32 getPixelInterp(const float32 x, const float32 y) const {
 86    // Do int32 conversion one time.
 87    const int32 floored_x = (int32) x;
 88    const int32 floored_y = (int32) y;
 89
 90    // Note: it might be the case that the *_[min|max] values are clipped, and
 91    // these (the a b c d vals) aren't (for speed purposes), but that doesn't
 92    // matter. We'll just be blending the pixel with itself in that case anyway.
 93    const float32 b = x - floored_x;
 94    const float32 a = 1.0f - b;
 95
 96    const float32 d = y - floored_y;
 97    const float32 c = 1.0f - d;
 98
 99    CHECK(validInterpPixel(x, y),
100          "x or y out of bounds! %.2f [0 - %d), %.2f [0 - %d)",
101          x, width_less_one_, y, height_less_one_);
102
103    const T* const pix_ptr = getPixelPtrConst(floored_x, floored_y);
104
105// Experimental NEON acceleration... not to be turned on until it's faster.
106#if FALSE
107#ifdef HAVE_ARMEABI_V7A
108    if (supportsNeon()) {
109      // Output value:
110      // a * c * p1 +
111      // b * c * p2 +
112      // a * d * p3 +
113      // b * d * p4
114      const float32x2_t ab = {a, b};
115      const float32x4_t ab_c_ab_d = vcombine_f32(vmul_n_f32(ab, c),
116                                                 vmul_n_f32(ab, d));
117
118      const float32x4_t p1p2p3p4 = {pix_ptr[0], pix_ptr[1],
119                                    pix_ptr[width_], pix_ptr[width_ + 1]};
120
121      float32x4_t almost = vmulq_f32(ab_c_ab_d, p1p2p3p4);
122
123      // Butterfly-ish sum.
124      almost = vaddq_f32(vrev64q_f32(almost), almost);
125
126      return vgetq_lane_f32(almost, 0) + vgetq_lane_f32(almost, 1);
127    }
128#endif
129#endif
130
131    // Get the pixel values surrounding this point.
132    const T& p1 = pix_ptr[0];
133    const T& p2 = pix_ptr[1];
134    const T& p3 = pix_ptr[width_];
135    const T& p4 = pix_ptr[width_ + 1];
136
137    // Simple bilinear interpolation between four reference pixels.
138    // If x is the value requested:
139    //     a  b
140    //   -------
141    // c |p1 p2|
142    //   |  x  |
143    // d |p3 p4|
144    //   -------
145    return  c * ((a * p1) + (b * p2)) +
146            d * ((a * p3) + (b * p4));
147  }
148
149  // Returns true iff the pixel is in the image's boundaries.
150  inline bool validPixel(const int32 x, const int32 y) const {
151    return inRange(x, ZERO, width_less_one_) &&
152           inRange(y, ZERO, height_less_one_);
153  }
154
155  // Returns true iff the pixel is in the image's boundaries for interpolation
156  // purposes.
157  // TODO(andrewharp): check in interpolation follow-up change.
158  inline bool validInterpPixel(const float32 x, const float32 y) const {
159    // Exclusive of max because we can be more efficient if we don't handle
160    // interpolating on or past the last pixel.
161    return (x >= ZERO) && (x < width_less_one_) &&
162           (y >= ZERO) && (y < height_less_one_);
163  }
164
165  // Safe lookup with boundary enforcement.
166  inline T getPixelClipped(const int32 x, const int32 y) const {
167    return getPixel(clip(x, ZERO, width_less_one_),
168                    clip(y, ZERO, height_less_one_));
169  }
170
171  // Returns a const pointer to the pixel in question.
172  inline const T* getPixelPtrConst(const int32 x, const int32 y) const {
173    CHECK_PIXEL(this, x, y);
174    return image_data_ + y * width_ + x;
175  }
176
177  // Returns a pointer to the pixel in question.
178  inline T* getPixelPtr(const int32 x, const int32 y) const {
179    CHECK_PIXEL(this, x, y);
180    return image_data_ + y * width_ + x;
181  }
182
183  // Fast lookup without boundary enforcement.
184  inline const T getPixel(const int32 x, const int32 y) const {
185    CHECK_PIXEL(this, x, y);
186    return image_data_[y * width_ + x];
187  }
188
189  // Fast setting without boundary enforcement.
190  inline void setPixel(const int32 x, const int32 y, const T& val) {
191    CHECK_PIXEL(this, x, y);
192    image_data_[y * width_ + x] = val;
193  }
194
195  // Clears image to a single value.
196  inline void clear(const T& val) {
197    memset(image_data_, val, sizeof(*image_data_) * num_pixels_);
198  }
199
200
201#ifdef HAVE_ARMEABI_V7A
202  // This function does the bulk of the work.
203  inline void downsample32ColumnsNeon(const uint8* const original,
204                                      const int32 stride,
205                                      const int32 orig_x) {
206    // Divide input x offset by 4 to find output offset.
207    const int32 new_x = orig_x >> 2;
208
209    // Initial offset into top row.
210    const uint8* offset = original + orig_x;
211
212    // Sum along vertical columns.
213    // Process 32x4 input pixels and 8x1 output pixels per iteration.
214    for (int32 new_y = 0; new_y < height_; ++new_y) {
215      uint16x8_t accum1 = vdupq_n_u16(0);
216      uint16x8_t accum2 = vdupq_n_u16(0);
217
218      // Go top to bottom across the four rows of input pixels that make up
219      // this output row.
220      for (int32 row_num = 0; row_num < 4; ++row_num) {
221        // First 16 bytes.
222        {
223          // Load 32 bytes of data from current offset.
224          const uint8x16_t curr_data1 = vld1q_u8(offset);
225          // Pairwise add and accumulate into accum vectors (16 bit to account
226          // for values above 255).
227          accum1 = vpadalq_u8(accum1, curr_data1);
228        }
229
230        // Second 16 bytes.
231        {
232          // Load 32 bytes of data from current offset.
233          const uint8x16_t curr_data2 = vld1q_u8(offset + 16);
234          // Pairwise add and accumulate into accum vectors (16 bit to account
235          // for values above 255).
236          accum2 = vpadalq_u8(accum2, curr_data2);
237        }
238
239        // Move offset down one row.
240        offset += stride;
241      }
242
243      // Add and widen, then divide by 16 (number of input pixels per output
244      // pixel) and narrow data from 32 bits per pixel to 16 bpp.
245      const uint16x4_t tmp_pix1 = vqshrn_n_u32(vpaddlq_u16(accum1), 4);
246      const uint16x4_t tmp_pix2 = vqshrn_n_u32(vpaddlq_u16(accum2), 4);
247
248      // Combine 4x1 pixel strips into 8x1 pixel strip and narrow from
249      // 16 bits to 8 bits per pixel.
250      const uint8x8_t allpixels = vmovn_u16(vcombine_u16(tmp_pix1, tmp_pix2));
251
252      // This points to the leftmost pixel of our 8 horizontally arranged
253      // pixels in the destination data.
254      uint8* const ptr_dst = getPixelPtr(new_x, new_y);
255
256      // Copy all pixels from composite 8x1 vector into output strip.
257      vst1_u8(ptr_dst, allpixels);
258    }
259  }
260
261
262  // Hardware accelerated downsampling method for supported devices.
263  // Requires that image size be a multiple of 16 pixels in each dimension,
264  // and that downsampling be by a factor of 4.
265  void downsampleAveragedNeon(const uint8* const original,
266                              const int32 stride) {
267    // Hardcoded to only work on 4x downsampling.
268    const int32 orig_width = width_ * 4;
269
270    // We process 32 input pixels lengthwise at a time.
271    // The output per pass of this loop is an 8 wide by 1 tall pixel strip.
272    for (int32 orig_x = 0; orig_x < orig_width; orig_x += 32) {
273      // Push it to the left enough so that it never goes out of bounds.
274      // This will result in some extra computation on the last pass on
275      // devices whose frame widths are not multiples of 32.
276      downsample32ColumnsNeon(original, stride, min(orig_x, orig_width - 32));
277    }
278  }
279#endif
280
281
282  // Naive downsampler that reduces image size by factor by averaging pixels in
283  // blocks of size factor x factor.
284  void downsampleAveraged(const T* const original, const int32 stride,
285                          const int32 factor) {
286#ifdef HAVE_ARMEABI_V7A
287    if (supportsNeon() &&
288        factor == 4 &&
289        (height_ % 4) == 0) {
290      downsampleAveragedNeon(original, stride);
291      return;
292    }
293#endif
294
295    const int32 pixels_per_block = factor * factor;
296
297    // For every pixel in resulting image.
298    for (int32 y = 0; y < height_; ++y) {
299      const int32 orig_y = y * factor;
300      const int32 y_bound = orig_y + factor;
301
302      // Sum up the original pixels.
303      for (int32 x = 0; x < width_; ++x) {
304        const int32 orig_x = x * factor;
305        const int32 x_bound = orig_x + factor;
306
307        // Making this int32 because type U or T might overflow.
308        int32 pixel_sum = 0;
309
310        // Grab all the pixels that make up this pixel.
311        for (int32 curr_y = orig_y; curr_y < y_bound; ++curr_y) {
312          const T* p = original + curr_y * stride + orig_x;
313
314          for (int32 curr_x = orig_x; curr_x < x_bound; ++curr_x) {
315            pixel_sum += *p++;
316          }
317        }
318
319        setPixel(x, y, pixel_sum / pixels_per_block);
320      }
321    }
322  }
323
324  // Naive downsampler that reduces image size by factor by averaging pixels in
325  // blocks of size factor x factor.
326  void downsampleAveraged(const Image<T>& original, const int32 factor) {
327    downsampleAveraged(original.getPixelPtr(0, 0), original.getWidth(), factor);
328  }
329
330  // Relatively efficient downsampling of an image by a factor of two with a
331  // low-pass 3x3 smoothing operation thrown in.
332  void downsampleSmoothed3x3(const Image<T>& original) {
333    for (int32 y = 0; y < height_; ++y) {
334      const int32 orig_y = clip(2 * y, ZERO, original.height_less_one_);
335      const int32 min_y = clip(orig_y - 1, ZERO, original.height_less_one_);
336      const int32 max_y = clip(orig_y + 1, ZERO, original.height_less_one_);
337
338      for (int32 x = 0; x < width_; ++x) {
339        const int32 orig_x = clip(2 * x, ZERO, original.width_less_one_);
340        const int32 min_x = clip(orig_x - 1, ZERO, original.width_less_one_);
341        const int32 max_x = clip(orig_x + 1, ZERO, original.width_less_one_);
342
343        // Center.
344        int32 pixel_sum = original.getPixel(orig_x, orig_y) * 4;
345
346        // Sides.
347        pixel_sum += (original.getPixel(max_x, orig_y) +
348                      original.getPixel(min_x, orig_y) +
349                      original.getPixel(orig_x, max_y) +
350                      original.getPixel(orig_x, min_y)) * 2;
351
352        // Diagonals.
353        pixel_sum += (original.getPixel(max_x, max_y) +
354                      original.getPixel(min_x, max_y) +
355                      original.getPixel(max_x, min_y) +
356                      original.getPixel(min_x, min_y));
357
358        const int32 pixel_val = pixel_sum>>4;  // 16
359
360        //LOGV("Setting %d,%d to %d", col, row, pixel_val);
361
362        setPixel(x, y, pixel_val);
363      }
364    }
365  }
366
367  // Relatively efficient downsampling of an image by a factor of two with a
368  // low-pass 5x5 smoothing operation thrown in.
369  void downsampleSmoothed5x5(const Image<T>& original) {
370    const int32 max_x = original.width_less_one_;
371    const int32 max_y = original.height_less_one_;
372
373    // The JY Bouget paper on Lucas-Kanade recommends a
374    // [1/16 1/4 3/8 1/4 1/16]^2 filter.
375    // This works out to a [1 4 6 4 1]^2 / 256 array, precomputed below.
376    static const int32 window_radius = 2;
377    static const int32 window_size = window_radius*2 + 1;
378    static const int32 window_weights[] = {1, 4, 6, 4,1,  // 16 +
379                                           4,16,24,16,4,  // 64 +
380                                           6,24,36,24,6,  // 96 +
381                                           4,16,24,16,4,  // 64 +
382                                           1, 4, 6, 4,1}; // 16 = 256
383
384    // We'll multiply and sum with the the whole numbers first, then divide by
385    // the total weight to normalize at the last moment.
386    for (int32 y = 0; y < height_; ++y) {
387      for (int32 x = 0; x < width_; ++x) {
388        int32 pixel_sum = 0;
389
390        const int32* w = window_weights;
391        const int32 start_x = clip((x<<1) - window_radius, ZERO, max_x);
392
393        // Clip the boundaries to the size of the image.
394        for (int32 window_y = 0; window_y < window_size; ++window_y) {
395          const int32 start_y =
396              clip((y<<1) - window_radius + window_y, ZERO, max_y);
397
398          const T* p = original.getPixelPtrConst(start_x, start_y);
399
400          for (int32 window_x = 0; window_x < window_size; ++window_x) {
401            pixel_sum +=  *p++ * *w++;
402          }
403        }
404
405        // Conversion to type T will happen here after shifting right 8 bits to
406        // divide by 256.
407        setPixel(x, y, pixel_sum >> 8);
408      }
409    }
410  }
411
412  // Optimized Scharr filter on a single pixel in the X direction.
413  // Scharr filters are like central-difference operators, but have more
414  // rotational symmetry in their response because they also consider the
415  // diagonal neighbors.
416  template <typename U>
417  inline T scharrPixelX(const Image<U>& original,
418                        const int32 center_x, const int32 center_y) const {
419    const int32 min_x = clip(center_x - 1, ZERO, original.width_less_one_);
420    const int32 max_x = clip(center_x + 1, ZERO, original.width_less_one_);
421    const int32 min_y = clip(center_y - 1, ZERO, original.height_less_one_);
422    const int32 max_y = clip(center_y + 1, ZERO, original.height_less_one_);
423
424    // Convolution loop unrolled for performance...
425    return (3 * (original.getPixel(max_x, min_y)
426                 + original.getPixel(max_x, max_y)
427                 - original.getPixel(min_x, min_y)
428                 - original.getPixel(min_x, min_y))
429            + 10 * (original.getPixel(max_x, center_y)
430                    - original.getPixel(min_x, center_y))) / 32;
431  }
432
433  // Optimized Scharr filter on a single pixel in the X direction.
434  // Scharr filters are like central-difference operators, but have more
435  // rotational symmetry in their response because they also consider the
436  // diagonal neighbors.
437  template <typename U>
438  inline T scharrPixelY(const Image<U>& original,
439                        const int32 center_x, const int32 center_y) const {
440    const int32 min_x = clip(center_x - 1, 0, original.width_less_one_);
441    const int32 max_x = clip(center_x + 1, 0, original.width_less_one_);
442    const int32 min_y = clip(center_y - 1, 0, original.height_less_one_);
443    const int32 max_y = clip(center_y + 1, 0, original.height_less_one_);
444
445    // Convolution loop unrolled for performance...
446    return (3 * (original.getPixel(min_x, max_y)
447                 + original.getPixel(max_x, max_y)
448                 - original.getPixel(min_x, min_y)
449                 - original.getPixel(max_x, min_y))
450            + 10 * (original.getPixel(center_x, max_y)
451                    - original.getPixel(center_x, min_y))) / 32;
452  }
453
454  // Convolve the image with a Scharr filter in the X direction.
455  // Much faster than an equivalent generic convolution.
456  template <typename U>
457  inline void scharrX(const Image<U>& original) {
458    for (int32 y = 0; y < height_; ++y) {
459      for (int32 x = 0; x < width_; ++x) {
460        setPixel(x, y, scharrPixelX(original, x, y));
461      }
462    }
463  }
464
465  // Convolve the image with a Scharr filter in the Y direction.
466  // Much faster than an equivalent generic convolution.
467  template <typename U>
468  inline void scharrY(const Image<U>& original) {
469    for (int32 y = 0; y < height_; ++y) {
470      for (int32 x = 0; x < width_; ++x) {
471        setPixel(x, y, scharrPixelY(original, x, y));
472      }
473    }
474  }
475
476  static inline T halfDiff(int32 first, int32 second) {
477    return (second - first) / 2;
478  }
479
480  template <typename U>
481  void derivativeX(const Image<U>& original) {
482    for (int32 y = 0; y < height_; ++y) {
483      T* const dest_row = getPixelPtr(0, y);
484      const U* const source_row = original.getPixelPtrConst(0, y);
485
486      // Compute first pixel.
487      dest_row[0] = halfDiff(source_row[0], source_row[1]);
488
489      // Last pixel.
490      dest_row[width_less_one_] = halfDiff(source_row[width_less_one_ - 1],
491                                           source_row[width_less_one_]);
492
493      // All the pixels in between.
494      const U* const source_prev_pixel = source_row - 1;
495      const U* const source_next_pixel = source_row + 1;
496      for (int32 x = 1; x < width_less_one_; ++x) {
497        dest_row[x] = halfDiff(source_prev_pixel[x], source_next_pixel[x]);
498      }
499    }
500  }
501
502  template <typename U>
503  void derivativeY(const Image<U>& original) {
504    for (int32 y = 0; y < height_; ++y) {
505      T* const dest_row = getPixelPtr(0, y);
506
507      const U* const source_prev_pixel =
508          original.getPixelPtrConst(0, max(0, y - 1));
509
510      const U* const source_next_pixel =
511          original.getPixelPtrConst(0, min(height_less_one_, y + 1));
512
513      for (int32 x = 0; x < width_; ++x) {
514        dest_row[x] = halfDiff(source_prev_pixel[x], source_next_pixel[x]);
515      }
516    }
517  }
518
519  // Generic function for convolving pixel with 3x3 filter.
520  // Filter pixels should be in row major order.
521  template <typename U>
522  inline T convolvePixel3x3(const Image<U>& original,
523                            const int32* const filter,
524                            const int32 center_x, const int32 center_y,
525                            const int32 total) const {
526    int32 sum = 0;
527    for (int32 filter_y = 0; filter_y < 3; ++filter_y) {
528      const int32 y = clip(center_y - 1 + filter_y, 0, original.getHeight());
529      for (int32 filter_x = 0; filter_x < 3; ++filter_x) {
530        const int32 x = clip(center_x - 1 + filter_x, 0, original.getWidth());
531        sum += original.getPixel(x, y) * filter[filter_y * 3 + filter_x];
532      }
533    }
534    return sum / total;
535  }
536
537  // Generic function for convolving an image with a 3x3 filter.
538  // TODO(andrewharp): Generalize this for any size filter.
539  template <typename U>
540  inline void convolve3x3(const Image<U>& original,
541                          const int32* const filter) {
542    int32 sum = 0;
543    for (int32 i = 0; i < 9; ++i) {
544      sum += abs(filter[i]);
545    }
546    for (int32 y = 0; y < height_; ++y) {
547      for (int32 x = 0; x < width_; ++x) {
548        setPixel(x, y, convolvePixel3x3(original, filter, x, y, sum));
549      }
550    }
551  }
552
553  // Load this image's data from a data array. The data at pixels is assumed to
554  // have dimensions equivalent to this image's dimensions * factor.
555  inline void fromArray(const T* const pixels, const int32 stride,
556                        const int32 factor) {
557    if (factor == 1) {
558      // If not subsampling, memcpy per line should be faster.
559      memcpy(this->image_data_, pixels, num_pixels_ * sizeof(T));
560      return;
561    }
562
563    downsampleAveraged(pixels, stride, factor);
564  }
565
566  // Copy the image back out to an appropriately sized data array.
567  inline void toArray(T* const pixels) const {
568    // If not subsampling, memcpy should be faster.
569    memcpy(pixels, this->image_data_, num_pixels_ * sizeof(T));
570  }
571
572 private:
573  inline void allocate() {
574    image_data_ = (T*)malloc(num_pixels_ * sizeof(T));
575    if (image_data_ == NULL) {
576      LOGE("Couldn't allocate image data!");
577    }
578  }
579
580  T* image_data_;
581  int32 width_;
582  int32 height_;
583
584 public:
585  // Precompute these for efficiency's sake as they're used by a lot of
586  // clipping code and loop code.
587  // TODO(andrewharp): make these only accessible by other Images.
588  int32 width_less_one_;
589  int32 height_less_one_;
590  int32 num_pixels_;
591};
592
593
594// Create a pyramid of downsampled images. The first level of the pyramid is the
595// original image.
596inline void computeSmoothedPyramid(const Image<uint8>& frame,
597                                   const int32 num_levels,
598                                   Image<uint8>** const pyramid) {
599  // TODO(andrewharp): Find a const correct solution to this...
600  // Maybe make an pyramid class with the first level of this pyramid as a
601  // separate pointer?
602
603  // Cast away const, but we're not going to hurt it, honest!
604  pyramid[0] = const_cast<Image<uint8>*>(&frame);
605
606  for (int32 l = 1; l < num_levels; ++l) {
607    pyramid[l]->downsampleSmoothed3x3(*pyramid[l - 1]);
608  }
609}
610
611
612// Create a spatial derivative pyramid based on a downsampled pyramid.
613inline void computeSpatialPyramid(const Image<uint8>** const pyramid,
614                                  const int32 num_levels,
615                                  Image<int32>** pyramid_x,
616                                  Image<int32>** pyramid_y) {
617  for (int32 l = 0; l < num_levels; ++l) {
618    const Image<uint8>& frame = *pyramid[l];
619
620    // Fast convolutions to find spatial derivatives.
621    pyramid_x[l]->derivativeX(frame);
622    pyramid_y[l]->derivativeY(frame);
623  }
624}
625
626
627// Marks a circle of a given radius on the boolean image.
628// If the center spot is already marked, don't do anything and return false.
629// Otherwise, mark everything in range true and return true.
630template <typename U>
631inline static bool markImage(const int32 x, const int32 y,
632                             const int32 radius,
633                             Image<U>* const img) {
634  if (img->getPixel(x, y)) {
635    // Already claimed, sorry.
636    return false;
637  }
638
639  const int32 squared_radius = square(radius);
640
641  for (int32 d_y = 0; d_y < radius; ++d_y) {
642    const int32 squared_y_dist = square(d_y);
643
644    const int32 min_y = y > d_y ? y - d_y : ZERO;
645    const int32 max_y = min(y + d_y, img->height_less_one_);
646
647    for (int32 d_x = 0; d_x < radius; ++d_x) {
648      if (squared_y_dist + square(d_x) <= squared_radius) {
649        const int32 min_x = x > d_x ? x - d_x : ZERO;
650        const int32 max_x = min(x + d_x, img->width_less_one_);
651
652        // Mark all four quadrants.
653        img->setPixel(max_x, max_y, true);
654        img->setPixel(max_x, min_y, true);
655        img->setPixel(min_x, max_y, true);
656        img->setPixel(min_x, min_y, true);
657      } else {
658        // Once we're too far out, we're not coming back in.
659        break;
660      }
661    }
662  }
663  return true;
664}
665
666
667// Puts the image gradient matrix about a pixel into the 2x2 float array G.
668// vals_x should be an array of the window x gradient values, whose indices
669// can be in any order but are parallel to the vals_y entries.
670// See http://robots.stanford.edu/cs223b04/algo_tracking.pdf for more details.
671inline void calculateG(const float32* const vals_x, const float32* const vals_y,
672                       const int32 num_vals, float* const G) {
673  // Defined here because we want to keep track of how many values were
674  // processed by NEON, so that we can finish off the remainder the normal
675  // way.
676  int32 i = 0;
677
678#ifdef HAVE_ARMEABI_V7A
679  if (supportsNeon()) {
680    const float32_t* const arm_vals_x = (const float32_t*) vals_x;
681    const float32_t* const arm_vals_y = (const float32_t*) vals_y;
682
683    // Running sums.
684    float32x4_t xx = vdupq_n_f32(0.0f);
685    float32x4_t xy = vdupq_n_f32(0.0f);
686    float32x4_t yy = vdupq_n_f32(0.0f);
687
688    // Maximum index we can load 4 consecutive values from.
689    // e.g. if there are 81 values, our last full pass can be from index 77:
690    // 81-4=>77 (77, 78, 79, 80)
691    const int32 max_i = num_vals - 4;
692
693    // Process values 4 at a time, accumulating the sums of
694    // the pixel-wise x*x, x*y, and y*y values.
695    for (; i <= max_i; i += 4) {
696      // Load xs
697      float32x4_t x = vld1q_f32(arm_vals_x + i);
698
699      // Multiply x*x and accumulate.
700      xx = vmlaq_f32(xx, x, x);
701
702      // Load ys
703      float32x4_t y = vld1q_f32(arm_vals_y + i);
704
705      // Multiply x*y and accumulate.
706      xy = vmlaq_f32(xy, x, y);
707
708      // Multiply y*y and accumulate.
709      yy = vmlaq_f32(yy, y, y);
710    }
711
712    static float32_t xx_vals[4];
713    static float32_t xy_vals[4];
714    static float32_t yy_vals[4];
715
716    vst1q_f32(xx_vals, xx);
717    vst1q_f32(xy_vals, xy);
718    vst1q_f32(yy_vals, yy);
719
720    // Accumulated values are store in sets of 4, we have to manually add
721    // the last bits together.
722    for (int32 j = 0; j < 4; ++j) {
723      G[0] += xx_vals[j];
724      G[1] += xy_vals[j];
725      G[3] += yy_vals[j];
726    }
727  }
728#endif
729  // Non-accelerated version, also finishes off last few values (< 4) from
730  // above.
731  for (; i < num_vals; ++i) {
732    G[0] += square(vals_x[i]);
733    G[1] += vals_x[i] * vals_y[i];
734    G[3] += square(vals_y[i]);
735  }
736
737  // The matrix is symmetric, so this is a given.
738  G[2] = G[1];
739}
740
741
742// Puts the image gradient matrix about a pixel into the 2x2 float array G.
743// Looks up interpolated pixels, then calls above method for implementation.
744inline void calculateG(const int window_size,
745                       const float32 center_x, const float center_y,
746                       const Image<int32>& I_x, const Image<int32>& I_y,
747                       float* const G) {
748  CHECK(I_x.validPixel(center_x, center_y), "Problem in calculateG!");
749
750  // Hardcoded to allow for a max window radius of 5 (9 pixels x 9 pixels).
751  static const int kMaxWindowRadius = 5;
752  CHECK(window_size <= kMaxWindowRadius,
753        "Window %d > %d!", window_size, kMaxWindowRadius);
754
755  // Diameter of window is 2 * radius + 1 for center pixel.
756  static const int kWindowBufferSize =
757      (kMaxWindowRadius * 2 + 1) * (kMaxWindowRadius * 2 + 1);
758
759  // Preallocate buffers statically for efficiency.
760  static float32 vals_x[kWindowBufferSize];
761  static float32 vals_y[kWindowBufferSize];
762
763  int32 num_vals = 0;
764
765  for (int32 win_x = -window_size; win_x <= window_size; ++win_x) {
766    for (int32 win_y = -window_size; win_y <= window_size; ++win_y) {
767      vals_x[num_vals] = I_x.getPixelInterp(center_x + win_x,
768                                            center_y + win_y);
769      vals_y[num_vals] = I_y.getPixelInterp(center_x + win_x,
770                                            center_y + win_y);
771      ++num_vals;
772    }
773  }
774  calculateG(vals_x, vals_y, num_vals, G);
775}
776
777}  // namespace flow
778
779#endif  // JAVA_COM_GOOGLE_ANDROID_APPS_UNVEIL_JNI_OPTICALFLOW_IMAGE_H_