image.h - This C header file provides an interface for opti…

/ocr/ocrservice/jni/opticalflow/image.h

http://eyes-free.googlecode.com/ · C Header · 779 lines · 474 code · 132 blank · 173 comment · 51 complexity · 30b321ae7bed2aab764ce116348a8428 MD5 · raw file

// Copyright 2009 Google Inc. All Rights Reserved.
// Author: andrewharp@google.com (Andrew Harp)

#ifndef JAVA_COM_GOOGLE_ANDROID_APPS_UNVEIL_JNI_OPTICALFLOW_IMAGE_H_
#define JAVA_COM_GOOGLE_ANDROID_APPS_UNVEIL_JNI_OPTICALFLOW_IMAGE_H_

#include "optical_flow_utils.h"

// TODO(andrewharp): Make this a cast to uint32 if/when we go unsigned for
// operations.
#define ZERO 0

#ifdef SANITY_CHECKS
  #define CHECK_PIXEL(IMAGE, X, Y) {\
    CHECK((IMAGE)->validPixel((X), (Y)), \
          "CHECK_PIXEL(%d,%d) in %dx%d image.", \
          static_cast<int32>(X), static_cast<int32>(Y), \
          (IMAGE)->getWidth(), (IMAGE)->getHeight());\
  }

  #define CHECK_PIXEL_INTERP(IMAGE, X, Y) {\
    CHECK((IMAGE)->validInterpPixel((X), (Y)), \
          "CHECK_PIXEL_INTERP(%.2f, %.2f) in %dx%d image.", \
          static_cast<float32>(X), static_cast<float32>(Y), \
          (IMAGE)->getWidth(), (IMAGE)->getHeight());\
  }
#else
  #define CHECK_PIXEL(image, x, y) {}
  #define CHECK_PIXEL_INTERP(IMAGE, X, Y) {}
#endif

namespace flow {

// TODO(andrewharp): Make explicit which operations support negative numbers or
// struct/class types in image data (possibly create fast multi-dim array class
// for data where pixel arithmetic does not make sense).

// Image class optimized for working on numeric arrays as grayscale image data.
// Supports other data types as a 2D array class, so long as no pixel math
// operations are called (convolution, downsampling, etc).
template <typename T>
class Image {
 public:
  Image(const int32 width, const int32 height) :
      width_(width),
      height_(height),
      width_less_one_(width_ - 1),
      height_less_one_(height_ - 1),
      num_pixels_(width_ * height_) {
    allocate();
  }

  explicit Image(const Size& size) :
      width_(size.width),
      height_(size.height),
      width_less_one_(width_ - 1),
      height_less_one_(height_ - 1),
      num_pixels_(width_ * height_) {
    allocate();
  }

  // Constructor that creates an image from preallocated data.
  // Note: The image takes ownership of the data.
  Image(const int32 width, const int32 height, T* const image) :
      width_(width),
      height_(height),
      width_less_one_(width_ - 1),
      height_less_one_(height_ - 1),
      num_pixels_(width_ * height_) {
    image_data_ = image;
    if (image_data_ == NULL) {
      LOGE("Can't create image with NULL data!");
    }
  }

  ~Image() {
    free(image_data_);
  }

  inline int32 getWidth() const { return width_; }
  inline int32 getHeight() const { return height_; }

  // Bilinearly sample a value between pixels.
  // Values outside of the image are sampled from the nearest edge of the image.
  inline float32 getPixelInterp(const float32 x, const float32 y) const {
    // Do int32 conversion one time.
    const int32 floored_x = (int32) x;
    const int32 floored_y = (int32) y;

    // Note: it might be the case that the *_[min|max] values are clipped, and
    // these (the a b c d vals) aren't (for speed purposes), but that doesn't
    // matter. We'll just be blending the pixel with itself in that case anyway.
    const float32 b = x - floored_x;
    const float32 a = 1.0f - b;

    const float32 d = y - floored_y;
    const float32 c = 1.0f - d;

    CHECK(validInterpPixel(x, y),
          "x or y out of bounds! %.2f [0 - %d), %.2f [0 - %d)",
          x, width_less_one_, y, height_less_one_);

    const T* const pix_ptr = getPixelPtrConst(floored_x, floored_y);

// Experimental NEON acceleration... not to be turned on until it's faster.
#if FALSE
#ifdef HAVE_ARMEABI_V7A
    if (supportsNeon()) {
      // Output value:
      // a * c * p1 +
      // b * c * p2 +
      // a * d * p3 +
      // b * d * p4
      const float32x2_t ab = {a, b};
      const float32x4_t ab_c_ab_d = vcombine_f32(vmul_n_f32(ab, c),
                                                 vmul_n_f32(ab, d));

      const float32x4_t p1p2p3p4 = {pix_ptr[0], pix_ptr[1],
                                    pix_ptr[width_], pix_ptr[width_ + 1]};

      float32x4_t almost = vmulq_f32(ab_c_ab_d, p1p2p3p4);

      // Butterfly-ish sum.
      almost = vaddq_f32(vrev64q_f32(almost), almost);

      return vgetq_lane_f32(almost, 0) + vgetq_lane_f32(almost, 1);
    }
#endif
#endif

    // Get the pixel values surrounding this point.
    const T& p1 = pix_ptr[0];
    const T& p2 = pix_ptr[1];
    const T& p3 = pix_ptr[width_];
    const T& p4 = pix_ptr[width_ + 1];

    // Simple bilinear interpolation between four reference pixels.
    // If x is the value requested:
    //     a  b
    //   -------
    // c |p1 p2|
    //   |  x  |
    // d |p3 p4|
    //   -------
    return  c * ((a * p1) + (b * p2)) +
            d * ((a * p3) + (b * p4));
  }

  // Returns true iff the pixel is in the image's boundaries.
  inline bool validPixel(const int32 x, const int32 y) const {
    return inRange(x, ZERO, width_less_one_) &&
           inRange(y, ZERO, height_less_one_);
  }

  // Returns true iff the pixel is in the image's boundaries for interpolation
  // purposes.
  // TODO(andrewharp): check in interpolation follow-up change.
  inline bool validInterpPixel(const float32 x, const float32 y) const {
    // Exclusive of max because we can be more efficient if we don't handle
    // interpolating on or past the last pixel.
    return (x >= ZERO) && (x < width_less_one_) &&
           (y >= ZERO) && (y < height_less_one_);
  }

  // Safe lookup with boundary enforcement.
  inline T getPixelClipped(const int32 x, const int32 y) const {
    return getPixel(clip(x, ZERO, width_less_one_),
                    clip(y, ZERO, height_less_one_));
  }

  // Returns a const pointer to the pixel in question.
  inline const T* getPixelPtrConst(const int32 x, const int32 y) const {
    CHECK_PIXEL(this, x, y);
    return image_data_ + y * width_ + x;
  }

  // Returns a pointer to the pixel in question.
  inline T* getPixelPtr(const int32 x, const int32 y) const {
    CHECK_PIXEL(this, x, y);
    return image_data_ + y * width_ + x;
  }

  // Fast lookup without boundary enforcement.
  inline const T getPixel(const int32 x, const int32 y) const {
    CHECK_PIXEL(this, x, y);
    return image_data_[y * width_ + x];
  }

  // Fast setting without boundary enforcement.
  inline void setPixel(const int32 x, const int32 y, const T& val) {
    CHECK_PIXEL(this, x, y);
    image_data_[y * width_ + x] = val;
  }

  // Clears image to a single value.
  inline void clear(const T& val) {
    memset(image_data_, val, sizeof(*image_data_) * num_pixels_);
  }


#ifdef HAVE_ARMEABI_V7A
  // This function does the bulk of the work.
  inline void downsample32ColumnsNeon(const uint8* const original,
                                      const int32 stride,
                                      const int32 orig_x) {
    // Divide input x offset by 4 to find output offset.
    const int32 new_x = orig_x >> 2;

    // Initial offset into top row.
    const uint8* offset = original + orig_x;

    // Sum along vertical columns.
    // Process 32x4 input pixels and 8x1 output pixels per iteration.
    for (int32 new_y = 0; new_y < height_; ++new_y) {
      uint16x8_t accum1 = vdupq_n_u16(0);
      uint16x8_t accum2 = vdupq_n_u16(0);

      // Go top to bottom across the four rows of input pixels that make up
      // this output row.
      for (int32 row_num = 0; row_num < 4; ++row_num) {
        // First 16 bytes.
        {
          // Load 32 bytes of data from current offset.
          const uint8x16_t curr_data1 = vld1q_u8(offset);
          // Pairwise add and accumulate into accum vectors (16 bit to account
          // for values above 255).
          accum1 = vpadalq_u8(accum1, curr_data1);
        }

        // Second 16 bytes.
        {
          // Load 32 bytes of data from current offset.
          const uint8x16_t curr_data2 = vld1q_u8(offset + 16);
          // Pairwise add and accumulate into accum vectors (16 bit to account
          // for values above 255).
          accum2 = vpadalq_u8(accum2, curr_data2);
        }

        // Move offset down one row.
        offset += stride;
      }

      // Add and widen, then divide by 16 (number of input pixels per output
      // pixel) and narrow data from 32 bits per pixel to 16 bpp.
      const uint16x4_t tmp_pix1 = vqshrn_n_u32(vpaddlq_u16(accum1), 4);
      const uint16x4_t tmp_pix2 = vqshrn_n_u32(vpaddlq_u16(accum2), 4);

      // Combine 4x1 pixel strips into 8x1 pixel strip and narrow from
      // 16 bits to 8 bits per pixel.
      const uint8x8_t allpixels = vmovn_u16(vcombine_u16(tmp_pix1, tmp_pix2));

      // This points to the leftmost pixel of our 8 horizontally arranged
      // pixels in the destination data.
      uint8* const ptr_dst = getPixelPtr(new_x, new_y);

      // Copy all pixels from composite 8x1 vector into output strip.
      vst1_u8(ptr_dst, allpixels);
    }
  }


  // Hardware accelerated downsampling method for supported devices.
  // Requires that image size be a multiple of 16 pixels in each dimension,
  // and that downsampling be by a factor of 4.
  void downsampleAveragedNeon(const uint8* const original,
                              const int32 stride) {
    // Hardcoded to only work on 4x downsampling.
    const int32 orig_width = width_ * 4;

    // We process 32 input pixels lengthwise at a time.
    // The output per pass of this loop is an 8 wide by 1 tall pixel strip.
    for (int32 orig_x = 0; orig_x < orig_width; orig_x += 32) {
      // Push it to the left enough so that it never goes out of bounds.
      // This will result in some extra computation on the last pass on
      // devices whose frame widths are not multiples of 32.
      downsample32ColumnsNeon(original, stride, min(orig_x, orig_width - 32));
    }
  }
#endif


  // Naive downsampler that reduces image size by factor by averaging pixels in
  // blocks of size factor x factor.
  void downsampleAveraged(const T* const original, const int32 stride,
                          const int32 factor) {
#ifdef HAVE_ARMEABI_V7A
    if (supportsNeon() &&
        factor == 4 &&
        (height_ % 4) == 0) {
      downsampleAveragedNeon(original, stride);
      return;
    }
#endif

    const int32 pixels_per_block = factor * factor;

    // For every pixel in resulting image.
    for (int32 y = 0; y < height_; ++y) {
      const int32 orig_y = y * factor;
      const int32 y_bound = orig_y + factor;

      // Sum up the original pixels.
      for (int32 x = 0; x < width_; ++x) {
        const int32 orig_x = x * factor;
        const int32 x_bound = orig_x + factor;

        // Making this int32 because type U or T might overflow.
        int32 pixel_sum = 0;

        // Grab all the pixels that make up this pixel.
        for (int32 curr_y = orig_y; curr_y < y_bound; ++curr_y) {
          const T* p = original + curr_y * stride + orig_x;

          for (int32 curr_x = orig_x; curr_x < x_bound; ++curr_x) {
            pixel_sum += *p++;
          }
        }

        setPixel(x, y, pixel_sum / pixels_per_block);
      }
    }
  }

  // Naive downsampler that reduces image size by factor by averaging pixels in
  // blocks of size factor x factor.
  void downsampleAveraged(const Image<T>& original, const int32 factor) {
    downsampleAveraged(original.getPixelPtr(0, 0), original.getWidth(), factor);
  }

  // Relatively efficient downsampling of an image by a factor of two with a
  // low-pass 3x3 smoothing operation thrown in.
  void downsampleSmoothed3x3(const Image<T>& original) {
    for (int32 y = 0; y < height_; ++y) {
      const int32 orig_y = clip(2 * y, ZERO, original.height_less_one_);
      const int32 min_y = clip(orig_y - 1, ZERO, original.height_less_one_);
      const int32 max_y = clip(orig_y + 1, ZERO, original.height_less_one_);

      for (int32 x = 0; x < width_; ++x) {
        const int32 orig_x = clip(2 * x, ZERO, original.width_less_one_);
        const int32 min_x = clip(orig_x - 1, ZERO, original.width_less_one_);
        const int32 max_x = clip(orig_x + 1, ZERO, original.width_less_one_);

        // Center.
        int32 pixel_sum = original.getPixel(orig_x, orig_y) * 4;

        // Sides.
        pixel_sum += (original.getPixel(max_x, orig_y) +
                      original.getPixel(min_x, orig_y) +
                      original.getPixel(orig_x, max_y) +
                      original.getPixel(orig_x, min_y)) * 2;

        // Diagonals.
        pixel_sum += (original.getPixel(max_x, max_y) +
                      original.getPixel(min_x, max_y) +
                      original.getPixel(max_x, min_y) +
                      original.getPixel(min_x, min_y));

        const int32 pixel_val = pixel_sum>>4;  // 16

        //LOGV("Setting %d,%d to %d", col, row, pixel_val);

        setPixel(x, y, pixel_val);
      }
    }
  }

  // Relatively efficient downsampling of an image by a factor of two with a
  // low-pass 5x5 smoothing operation thrown in.
  void downsampleSmoothed5x5(const Image<T>& original) {
    const int32 max_x = original.width_less_one_;
    const int32 max_y = original.height_less_one_;

    // The JY Bouget paper on Lucas-Kanade recommends a
    // [1/16 1/4 3/8 1/4 1/16]^2 filter.
    // This works out to a [1 4 6 4 1]^2 / 256 array, precomputed below.
    static const int32 window_radius = 2;
    static const int32 window_size = window_radius*2 + 1;
    static const int32 window_weights[] = {1, 4, 6, 4,1,  // 16 +
                                           4,16,24,16,4,  // 64 +
                                           6,24,36,24,6,  // 96 +
                                           4,16,24,16,4,  // 64 +
                                           1, 4, 6, 4,1}; // 16 = 256

    // We'll multiply and sum with the the whole numbers first, then divide by
    // the total weight to normalize at the last moment.
    for (int32 y = 0; y < height_; ++y) {
      for (int32 x = 0; x < width_; ++x) {
        int32 pixel_sum = 0;

        const int32* w = window_weights;
        const int32 start_x = clip((x<<1) - window_radius, ZERO, max_x);

        // Clip the boundaries to the size of the image.
        for (int32 window_y = 0; window_y < window_size; ++window_y) {
          const int32 start_y =
              clip((y<<1) - window_radius + window_y, ZERO, max_y);

          const T* p = original.getPixelPtrConst(start_x, start_y);

          for (int32 window_x = 0; window_x < window_size; ++window_x) {
            pixel_sum +=  *p++ * *w++;
          }
        }

        // Conversion to type T will happen here after shifting right 8 bits to
        // divide by 256.
        setPixel(x, y, pixel_sum >> 8);
      }
    }
  }

  // Optimized Scharr filter on a single pixel in the X direction.
  // Scharr filters are like central-difference operators, but have more
  // rotational symmetry in their response because they also consider the
  // diagonal neighbors.
  template <typename U>
  inline T scharrPixelX(const Image<U>& original,
                        const int32 center_x, const int32 center_y) const {
    const int32 min_x = clip(center_x - 1, ZERO, original.width_less_one_);
    const int32 max_x = clip(center_x + 1, ZERO, original.width_less_one_);
    const int32 min_y = clip(center_y - 1, ZERO, original.height_less_one_);
    const int32 max_y = clip(center_y + 1, ZERO, original.height_less_one_);

    // Convolution loop unrolled for performance...
    return (3 * (original.getPixel(max_x, min_y)
                 + original.getPixel(max_x, max_y)
                 - original.getPixel(min_x, min_y)
                 - original.getPixel(min_x, min_y))
            + 10 * (original.getPixel(max_x, center_y)
                    - original.getPixel(min_x, center_y))) / 32;
  }

  // Optimized Scharr filter on a single pixel in the X direction.
  // Scharr filters are like central-difference operators, but have more
  // rotational symmetry in their response because they also consider the
  // diagonal neighbors.
  template <typename U>
  inline T scharrPixelY(const Image<U>& original,
                        const int32 center_x, const int32 center_y) const {
    const int32 min_x = clip(center_x - 1, 0, original.width_less_one_);
    const int32 max_x = clip(center_x + 1, 0, original.width_less_one_);
    const int32 min_y = clip(center_y - 1, 0, original.height_less_one_);
    const int32 max_y = clip(center_y + 1, 0, original.height_less_one_);

    // Convolution loop unrolled for performance...
    return (3 * (original.getPixel(min_x, max_y)
                 + original.getPixel(max_x, max_y)
                 - original.getPixel(min_x, min_y)
                 - original.getPixel(max_x, min_y))
            + 10 * (original.getPixel(center_x, max_y)
                    - original.getPixel(center_x, min_y))) / 32;
  }

  // Convolve the image with a Scharr filter in the X direction.
  // Much faster than an equivalent generic convolution.
  template <typename U>
  inline void scharrX(const Image<U>& original) {
    for (int32 y = 0; y < height_; ++y) {
      for (int32 x = 0; x < width_; ++x) {
        setPixel(x, y, scharrPixelX(original, x, y));
      }
    }
  }

  // Convolve the image with a Scharr filter in the Y direction.
  // Much faster than an equivalent generic convolution.
  template <typename U>
  inline void scharrY(const Image<U>& original) {
    for (int32 y = 0; y < height_; ++y) {
      for (int32 x = 0; x < width_; ++x) {
        setPixel(x, y, scharrPixelY(original, x, y));
      }
    }
  }

  static inline T halfDiff(int32 first, int32 second) {
    return (second - first) / 2;
  }

  template <typename U>
  void derivativeX(const Image<U>& original) {
    for (int32 y = 0; y < height_; ++y) {
      T* const dest_row = getPixelPtr(0, y);
      const U* const source_row = original.getPixelPtrConst(0, y);

      // Compute first pixel.
      dest_row[0] = halfDiff(source_row[0], source_row[1]);

      // Last pixel.
      dest_row[width_less_one_] = halfDiff(source_row[width_less_one_ - 1],
                                           source_row[width_less_one_]);

      // All the pixels in between.
      const U* const source_prev_pixel = source_row - 1;
      const U* const source_next_pixel = source_row + 1;
      for (int32 x = 1; x < width_less_one_; ++x) {
        dest_row[x] = halfDiff(source_prev_pixel[x], source_next_pixel[x]);
      }
    }
  }

  template <typename U>
  void derivativeY(const Image<U>& original) {
    for (int32 y = 0; y < height_; ++y) {
      T* const dest_row = getPixelPtr(0, y);

      const U* const source_prev_pixel =
          original.getPixelPtrConst(0, max(0, y - 1));

      const U* const source_next_pixel =
          original.getPixelPtrConst(0, min(height_less_one_, y + 1));

      for (int32 x = 0; x < width_; ++x) {
        dest_row[x] = halfDiff(source_prev_pixel[x], source_next_pixel[x]);
      }
    }
  }

  // Generic function for convolving pixel with 3x3 filter.
  // Filter pixels should be in row major order.
  template <typename U>
  inline T convolvePixel3x3(const Image<U>& original,
                            const int32* const filter,
                            const int32 center_x, const int32 center_y,
                            const int32 total) const {
    int32 sum = 0;
    for (int32 filter_y = 0; filter_y < 3; ++filter_y) {
      const int32 y = clip(center_y - 1 + filter_y, 0, original.getHeight());
      for (int32 filter_x = 0; filter_x < 3; ++filter_x) {
        const int32 x = clip(center_x - 1 + filter_x, 0, original.getWidth());
        sum += original.getPixel(x, y) * filter[filter_y * 3 + filter_x];
      }
    }
    return sum / total;
  }

  // Generic function for convolving an image with a 3x3 filter.
  // TODO(andrewharp): Generalize this for any size filter.
  template <typename U>
  inline void convolve3x3(const Image<U>& original,
                          const int32* const filter) {
    int32 sum = 0;
    for (int32 i = 0; i < 9; ++i) {
      sum += abs(filter[i]);
    }
    for (int32 y = 0; y < height_; ++y) {
      for (int32 x = 0; x < width_; ++x) {
        setPixel(x, y, convolvePixel3x3(original, filter, x, y, sum));
      }
    }
  }

  // Load this image's data from a data array. The data at pixels is assumed to
  // have dimensions equivalent to this image's dimensions * factor.
  inline void fromArray(const T* const pixels, const int32 stride,
                        const int32 factor) {
    if (factor == 1) {
      // If not subsampling, memcpy per line should be faster.
      memcpy(this->image_data_, pixels, num_pixels_ * sizeof(T));
      return;
    }

    downsampleAveraged(pixels, stride, factor);
  }

  // Copy the image back out to an appropriately sized data array.
  inline void toArray(T* const pixels) const {
    // If not subsampling, memcpy should be faster.
    memcpy(pixels, this->image_data_, num_pixels_ * sizeof(T));
  }

 private:
  inline void allocate() {
    image_data_ = (T*)malloc(num_pixels_ * sizeof(T));
    if (image_data_ == NULL) {
      LOGE("Couldn't allocate image data!");
    }
  }

  T* image_data_;
  int32 width_;
  int32 height_;

 public:
  // Precompute these for efficiency's sake as they're used by a lot of
  // clipping code and loop code.
  // TODO(andrewharp): make these only accessible by other Images.
  int32 width_less_one_;
  int32 height_less_one_;
  int32 num_pixels_;
};


// Create a pyramid of downsampled images. The first level of the pyramid is the
// original image.
inline void computeSmoothedPyramid(const Image<uint8>& frame,
                                   const int32 num_levels,
                                   Image<uint8>** const pyramid) {
  // TODO(andrewharp): Find a const correct solution to this...
  // Maybe make an pyramid class with the first level of this pyramid as a
  // separate pointer?

  // Cast away const, but we're not going to hurt it, honest!
  pyramid[0] = const_cast<Image<uint8>*>(&frame);

  for (int32 l = 1; l < num_levels; ++l) {
    pyramid[l]->downsampleSmoothed3x3(*pyramid[l - 1]);
  }
}


// Create a spatial derivative pyramid based on a downsampled pyramid.
inline void computeSpatialPyramid(const Image<uint8>** const pyramid,
                                  const int32 num_levels,
                                  Image<int32>** pyramid_x,
                                  Image<int32>** pyramid_y) {
  for (int32 l = 0; l < num_levels; ++l) {
    const Image<uint8>& frame = *pyramid[l];

    // Fast convolutions to find spatial derivatives.
    pyramid_x[l]->derivativeX(frame);
    pyramid_y[l]->derivativeY(frame);
  }
}


// Marks a circle of a given radius on the boolean image.
// If the center spot is already marked, don't do anything and return false.
// Otherwise, mark everything in range true and return true.
template <typename U>
inline static bool markImage(const int32 x, const int32 y,
                             const int32 radius,
                             Image<U>* const img) {
  if (img->getPixel(x, y)) {
    // Already claimed, sorry.
    return false;
  }

  const int32 squared_radius = square(radius);

  for (int32 d_y = 0; d_y < radius; ++d_y) {
    const int32 squared_y_dist = square(d_y);

    const int32 min_y = y > d_y ? y - d_y : ZERO;
    const int32 max_y = min(y + d_y, img->height_less_one_);

    for (int32 d_x = 0; d_x < radius; ++d_x) {
      if (squared_y_dist + square(d_x) <= squared_radius) {
        const int32 min_x = x > d_x ? x - d_x : ZERO;
        const int32 max_x = min(x + d_x, img->width_less_one_);

        // Mark all four quadrants.
        img->setPixel(max_x, max_y, true);
        img->setPixel(max_x, min_y, true);
        img->setPixel(min_x, max_y, true);
        img->setPixel(min_x, min_y, true);
      } else {
        // Once we're too far out, we're not coming back in.
        break;
      }
    }
  }
  return true;
}


// Puts the image gradient matrix about a pixel into the 2x2 float array G.
// vals_x should be an array of the window x gradient values, whose indices
// can be in any order but are parallel to the vals_y entries.
// See http://robots.stanford.edu/cs223b04/algo_tracking.pdf for more details.
inline void calculateG(const float32* const vals_x, const float32* const vals_y,
                       const int32 num_vals, float* const G) {
  // Defined here because we want to keep track of how many values were
  // processed by NEON, so that we can finish off the remainder the normal
  // way.
  int32 i = 0;

#ifdef HAVE_ARMEABI_V7A
  if (supportsNeon()) {
    const float32_t* const arm_vals_x = (const float32_t*) vals_x;
    const float32_t* const arm_vals_y = (const float32_t*) vals_y;

    // Running sums.
    float32x4_t xx = vdupq_n_f32(0.0f);
    float32x4_t xy = vdupq_n_f32(0.0f);
    float32x4_t yy = vdupq_n_f32(0.0f);

    // Maximum index we can load 4 consecutive values from.
    // e.g. if there are 81 values, our last full pass can be from index 77:
    // 81-4=>77 (77, 78, 79, 80)
    const int32 max_i = num_vals - 4;

    // Process values 4 at a time, accumulating the sums of
    // the pixel-wise x*x, x*y, and y*y values.
    for (; i <= max_i; i += 4) {
      // Load xs
      float32x4_t x = vld1q_f32(arm_vals_x + i);

      // Multiply x*x and accumulate.
      xx = vmlaq_f32(xx, x, x);

      // Load ys
      float32x4_t y = vld1q_f32(arm_vals_y + i);

      // Multiply x*y and accumulate.
      xy = vmlaq_f32(xy, x, y);

      // Multiply y*y and accumulate.
      yy = vmlaq_f32(yy, y, y);
    }

    static float32_t xx_vals[4];
    static float32_t xy_vals[4];
    static float32_t yy_vals[4];

    vst1q_f32(xx_vals, xx);
    vst1q_f32(xy_vals, xy);
    vst1q_f32(yy_vals, yy);

    // Accumulated values are store in sets of 4, we have to manually add
    // the last bits together.
    for (int32 j = 0; j < 4; ++j) {
      G[0] += xx_vals[j];
      G[1] += xy_vals[j];
      G[3] += yy_vals[j];
    }
  }
#endif
  // Non-accelerated version, also finishes off last few values (< 4) from
  // above.
  for (; i < num_vals; ++i) {
    G[0] += square(vals_x[i]);
    G[1] += vals_x[i] * vals_y[i];
    G[3] += square(vals_y[i]);
  }

  // The matrix is symmetric, so this is a given.
  G[2] = G[1];
}


// Puts the image gradient matrix about a pixel into the 2x2 float array G.
// Looks up interpolated pixels, then calls above method for implementation.
inline void calculateG(const int window_size,
                       const float32 center_x, const float center_y,
                       const Image<int32>& I_x, const Image<int32>& I_y,
                       float* const G) {
  CHECK(I_x.validPixel(center_x, center_y), "Problem in calculateG!");

  // Hardcoded to allow for a max window radius of 5 (9 pixels x 9 pixels).
  static const int kMaxWindowRadius = 5;
  CHECK(window_size <= kMaxWindowRadius,
        "Window %d > %d!", window_size, kMaxWindowRadius);

  // Diameter of window is 2 * radius + 1 for center pixel.
  static const int kWindowBufferSize =
      (kMaxWindowRadius * 2 + 1) * (kMaxWindowRadius * 2 + 1);

  // Preallocate buffers statically for efficiency.
  static float32 vals_x[kWindowBufferSize];
  static float32 vals_y[kWindowBufferSize];

  int32 num_vals = 0;

  for (int32 win_x = -window_size; win_x <= window_size; ++win_x) {
    for (int32 win_y = -window_size; win_y <= window_size; ++win_y) {
      vals_x[num_vals] = I_x.getPixelInterp(center_x + win_x,
                                            center_y + win_y);
      vals_y[num_vals] = I_y.getPixelInterp(center_x + win_x,
                                            center_y + win_y);
      ++num_vals;
    }
  }
  calculateG(vals_x, vals_y, num_vals, G);
}

}  // namespace flow

#endif  // JAVA_COM_GOOGLE_ANDROID_APPS_UNVEIL_JNI_OPTICALFLOW_IMAGE_H_
Summary ✨

This C header file provides an interface for optical flow processing, a technique used to estimate the motion of pixels between two consecutive frames in a video sequence. It defines functions and data structures for image processing, including gradient calculation, interpolation, and buffer management, which can be used by applications such as Android apps.
Alerts (3)

Complexity hotspot; lines 287 to 289 (total complexity: 3)
287 288 289