/ocr/ocrservice/jni/opticalflow/image.h


// Copyright 2009 Google Inc. All Rights Reserved.
// Author: andrewharp@google.com (Andrew Harp)

#ifndef JAVA_COM_GOOGLE_ANDROID_APPS_UNVEIL_JNI_OPTICALFLOW_IMAGE_H_
#define JAVA_COM_GOOGLE_ANDROID_APPS_UNVEIL_JNI_OPTICALFLOW_IMAGE_H_

#include "optical_flow_utils.h"

// TODO(andrewharp): Make this a cast to uint32 if/when we go unsigned for
// operations.
#define ZERO 0

#ifdef SANITY_CHECKS
#define CHECK_PIXEL(IMAGE, X, Y) { \
  CHECK((IMAGE)->validPixel((X), (Y)), \
        "CHECK_PIXEL(%d,%d) in %dx%d image.", \
        static_cast<int32>(X), static_cast<int32>(Y), \
        (IMAGE)->getWidth(), (IMAGE)->getHeight()); \
}

#define CHECK_PIXEL_INTERP(IMAGE, X, Y) { \
  CHECK((IMAGE)->validInterpPixel((X), (Y)), \
        "CHECK_PIXEL_INTERP(%.2f, %.2f) in %dx%d image.", \
        static_cast<float32>(X), static_cast<float32>(Y), \
        (IMAGE)->getWidth(), (IMAGE)->getHeight()); \
}
#else
#define CHECK_PIXEL(IMAGE, X, Y) {}
#define CHECK_PIXEL_INTERP(IMAGE, X, Y) {}
#endif
namespace flow {

// TODO(andrewharp): Make explicit which operations support negative numbers or
// struct/class types in image data (possibly create fast multi-dim array class
// for data where pixel arithmetic does not make sense).

// Image class optimized for working on numeric arrays as grayscale image data.
// Supports other data types as a 2D array class, so long as no pixel math
// operations are called (convolution, downsampling, etc).
template <typename T>
class Image {
 public:
  Image(const int32 width, const int32 height) :
      width_(width),
      height_(height),
      width_less_one_(width_ - 1),
      height_less_one_(height_ - 1),
      num_pixels_(width_ * height_) {
    allocate();
  }

  explicit Image(const Size& size) :
      width_(size.width),
      height_(size.height),
      width_less_one_(width_ - 1),
      height_less_one_(height_ - 1),
      num_pixels_(width_ * height_) {
    allocate();
  }

  // Constructor that creates an image from preallocated data.
  // Note: The image takes ownership of the data and releases it with free(),
  // so it must have been allocated with malloc or a compatible allocator.
  Image(const int32 width, const int32 height, T* const image) :
      width_(width),
      height_(height),
      width_less_one_(width_ - 1),
      height_less_one_(height_ - 1),
      num_pixels_(width_ * height_) {
    image_data_ = image;
    if (image_data_ == NULL) {
      LOGE("Can't create image with NULL data!");
    }
  }

  ~Image() {
    free(image_data_);
  }
  inline int32 getWidth() const { return width_; }
  inline int32 getHeight() const { return height_; }

  // Bilinearly sample a value between pixels.
  // Values outside of the image are sampled from the nearest edge of the image.
  inline float32 getPixelInterp(const float32 x, const float32 y) const {
    // Do int32 conversion one time.
    const int32 floored_x = (int32) x;
    const int32 floored_y = (int32) y;

    // Note: it might be the case that the *_[min|max] values are clipped, and
    // these (the a b c d vals) aren't (for speed purposes), but that doesn't
    // matter. We'll just be blending the pixel with itself in that case anyway.
    const float32 b = x - floored_x;
    const float32 a = 1.0f - b;

    const float32 d = y - floored_y;
    const float32 c = 1.0f - d;

    CHECK(validInterpPixel(x, y),
          "x or y out of bounds! %.2f [0 - %d), %.2f [0 - %d)",
          x, width_less_one_, y, height_less_one_);

    const T* const pix_ptr = getPixelPtrConst(floored_x, floored_y);

    // Experimental NEON acceleration... not to be turned on until it's faster.
#if FALSE
#ifdef HAVE_ARMEABI_V7A
    if (supportsNeon()) {
      // Output value:
      //   a * c * p1 +
      //   b * c * p2 +
      //   a * d * p3 +
      //   b * d * p4
      const float32x2_t ab = {a, b};
      const float32x4_t ab_c_ab_d = vcombine_f32(vmul_n_f32(ab, c),
                                                 vmul_n_f32(ab, d));
      const float32x4_t p1p2p3p4 = {pix_ptr[0], pix_ptr[1],
                                    pix_ptr[width_], pix_ptr[width_ + 1]};
      float32x4_t almost = vmulq_f32(ab_c_ab_d, p1p2p3p4);

      // Butterfly-ish sum.
      almost = vaddq_f32(vrev64q_f32(almost), almost);
      return vgetq_lane_f32(almost, 0) + vgetq_lane_f32(almost, 1);
    }
#endif
#endif

    // Get the pixel values surrounding this point.
    const T& p1 = pix_ptr[0];
    const T& p2 = pix_ptr[1];
    const T& p3 = pix_ptr[width_];
    const T& p4 = pix_ptr[width_ + 1];

    // Simple bilinear interpolation between four reference pixels.
    // If x is the value requested:
    //     a   b
    //    -------
    //  c |p1 p2|
    //    |  x  |
    //  d |p3 p4|
    //    -------
    return c * ((a * p1) + (b * p2)) +
           d * ((a * p3) + (b * p4));
  }
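
  // Worked example (illustrative, not from the original source): sampling at
  // (x, y) = (2.25, 3.5) gives floored_x = 2, floored_y = 3, so b = 0.25,
  // a = 0.75, d = 0.5, c = 0.5. With p1 = getPixel(2, 3), p2 = getPixel(3, 3),
  // p3 = getPixel(2, 4) and p4 = getPixel(3, 4), the result is:
  //   0.5f * (0.75f * p1 + 0.25f * p2) + 0.5f * (0.75f * p3 + 0.25f * p4)
  // If all four neighbors are equal, the result is exactly that value.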
  // Returns true iff the pixel is in the image's boundaries.
  inline bool validPixel(const int32 x, const int32 y) const {
    return inRange(x, ZERO, width_less_one_) &&
           inRange(y, ZERO, height_less_one_);
  }

  // Returns true iff the pixel is in the image's boundaries for interpolation
  // purposes.
  // TODO(andrewharp): check in interpolation follow-up change.
  inline bool validInterpPixel(const float32 x, const float32 y) const {
    // Exclusive of max because we can be more efficient if we don't handle
    // interpolating on or past the last pixel.
    return (x >= ZERO) && (x < width_less_one_) &&
           (y >= ZERO) && (y < height_less_one_);
  }

  // Safe lookup with boundary enforcement.
  inline T getPixelClipped(const int32 x, const int32 y) const {
    return getPixel(clip(x, ZERO, width_less_one_),
                    clip(y, ZERO, height_less_one_));
  }

  // Returns a const pointer to the pixel in question.
  inline const T* getPixelPtrConst(const int32 x, const int32 y) const {
    CHECK_PIXEL(this, x, y);
    return image_data_ + y * width_ + x;
  }

  // Returns a pointer to the pixel in question.
  inline T* getPixelPtr(const int32 x, const int32 y) const {
    CHECK_PIXEL(this, x, y);
    return image_data_ + y * width_ + x;
  }
  // Fast lookup without boundary enforcement.
  inline const T getPixel(const int32 x, const int32 y) const {
    CHECK_PIXEL(this, x, y);
    return image_data_[y * width_ + x];
  }

  // Fast setting without boundary enforcement.
  inline void setPixel(const int32 x, const int32 y, const T& val) {
    CHECK_PIXEL(this, x, y);
    image_data_[y * width_ + x] = val;
  }

  // Clears image to a single value.
  // Note: memset writes val to every *byte*, so for multi-byte T this is only
  // correct when val is 0 (or another value whose bytes are all identical).
  inline void clear(const T& val) {
    memset(image_data_, val, sizeof(*image_data_) * num_pixels_);
  }
#ifdef HAVE_ARMEABI_V7A
  // This function does the bulk of the work.
  inline void downsample32ColumnsNeon(const uint8* const original,
                                      const int32 stride,
                                      const int32 orig_x) {
    // Divide input x offset by 4 to find output offset.
    const int32 new_x = orig_x >> 2;

    // Initial offset into top row.
    const uint8* offset = original + orig_x;

    // Sum along vertical columns.
    // Process 32x4 input pixels and 8x1 output pixels per iteration.
    for (int32 new_y = 0; new_y < height_; ++new_y) {
      uint16x8_t accum1 = vdupq_n_u16(0);
      uint16x8_t accum2 = vdupq_n_u16(0);

      // Go top to bottom across the four rows of input pixels that make up
      // this output row.
      for (int32 row_num = 0; row_num < 4; ++row_num) {
        // First 16 bytes.
        {
          // Load 16 bytes of data from current offset.
          const uint8x16_t curr_data1 = vld1q_u8(offset);

          // Pairwise add and accumulate into accum vectors (16 bit to account
          // for values above 255).
          accum1 = vpadalq_u8(accum1, curr_data1);
        }

        // Second 16 bytes.
        {
          // Load the next 16 bytes of data from current offset.
          const uint8x16_t curr_data2 = vld1q_u8(offset + 16);

          // Pairwise add and accumulate into accum vectors (16 bit to account
          // for values above 255).
          accum2 = vpadalq_u8(accum2, curr_data2);
        }

        // Move offset down one row.
        offset += stride;
      }

      // Add and widen, then divide by 16 (number of input pixels per output
      // pixel) and narrow data from 32 bits per pixel to 16 bpp.
      const uint16x4_t tmp_pix1 = vqshrn_n_u32(vpaddlq_u16(accum1), 4);
      const uint16x4_t tmp_pix2 = vqshrn_n_u32(vpaddlq_u16(accum2), 4);

      // Combine 4x1 pixel strips into 8x1 pixel strip and narrow from
      // 16 bits to 8 bits per pixel.
      const uint8x8_t allpixels = vmovn_u16(vcombine_u16(tmp_pix1, tmp_pix2));

      // This points to the leftmost pixel of our 8 horizontally arranged
      // pixels in the destination data.
      uint8* const ptr_dst = getPixelPtr(new_x, new_y);

      // Copy all pixels from composite 8x1 vector into output strip.
      vst1_u8(ptr_dst, allpixels);
    }
  }

  // Hardware accelerated downsampling method for supported devices.
  // Requires that image size be a multiple of 16 pixels in each dimension,
  // and that downsampling be by a factor of 4.
  void downsampleAveragedNeon(const uint8* const original,
                              const int32 stride) {
    // Hardcoded to only work on 4x downsampling.
    const int32 orig_width = width_ * 4;

    // We process 32 input pixels lengthwise at a time.
    // The output per pass of this loop is an 8 wide by 1 tall pixel strip.
    for (int32 orig_x = 0; orig_x < orig_width; orig_x += 32) {
      // Push it to the left enough so that it never goes out of bounds.
      // This will result in some extra computation on the last pass on
      // devices whose frame widths are not multiples of 32.
      downsample32ColumnsNeon(original, stride, min(orig_x, orig_width - 32));
    }
  }
#endif
  // Naive downsampler that reduces image size by factor by averaging pixels in
  // blocks of size factor x factor.
  void downsampleAveraged(const T* const original, const int32 stride,
                          const int32 factor) {
#ifdef HAVE_ARMEABI_V7A
    if (supportsNeon() &&
        factor == 4 &&
        (height_ % 4) == 0) {
      downsampleAveragedNeon(original, stride);
      return;
    }
#endif

    const int32 pixels_per_block = factor * factor;

    // For every pixel in resulting image.
    for (int32 y = 0; y < height_; ++y) {
      const int32 orig_y = y * factor;
      const int32 y_bound = orig_y + factor;

      for (int32 x = 0; x < width_; ++x) {
        const int32 orig_x = x * factor;
        const int32 x_bound = orig_x + factor;

        // Making this int32 because type T might overflow.
        int32 pixel_sum = 0;

        // Sum up all the original pixels that make up this output pixel.
        for (int32 curr_y = orig_y; curr_y < y_bound; ++curr_y) {
          const T* p = original + curr_y * stride + orig_x;
          for (int32 curr_x = orig_x; curr_x < x_bound; ++curr_x) {
            pixel_sum += *p++;
          }
        }

        setPixel(x, y, pixel_sum / pixels_per_block);
      }
    }
  }

  // Naive downsampler that reduces image size by factor by averaging pixels in
  // blocks of size factor x factor.
  void downsampleAveraged(const Image<T>& original, const int32 factor) {
    downsampleAveraged(original.getPixelPtr(0, 0), original.getWidth(), factor);
  }
  // Relatively efficient downsampling of an image by a factor of two with a
  // low-pass 3x3 smoothing operation thrown in.
  void downsampleSmoothed3x3(const Image<T>& original) {
    for (int32 y = 0; y < height_; ++y) {
      const int32 orig_y = clip(2 * y, ZERO, original.height_less_one_);
      const int32 min_y = clip(orig_y - 1, ZERO, original.height_less_one_);
      const int32 max_y = clip(orig_y + 1, ZERO, original.height_less_one_);

      for (int32 x = 0; x < width_; ++x) {
        const int32 orig_x = clip(2 * x, ZERO, original.width_less_one_);
        const int32 min_x = clip(orig_x - 1, ZERO, original.width_less_one_);
        const int32 max_x = clip(orig_x + 1, ZERO, original.width_less_one_);

        // Center.
        int32 pixel_sum = original.getPixel(orig_x, orig_y) * 4;

        // Sides.
        pixel_sum += (original.getPixel(max_x, orig_y) +
                      original.getPixel(min_x, orig_y) +
                      original.getPixel(orig_x, max_y) +
                      original.getPixel(orig_x, min_y)) * 2;

        // Diagonals.
        pixel_sum += (original.getPixel(max_x, max_y) +
                      original.getPixel(min_x, max_y) +
                      original.getPixel(max_x, min_y) +
                      original.getPixel(min_x, min_y));

        // Divide by the total kernel weight of 16.
        const int32 pixel_val = pixel_sum >> 4;

        setPixel(x, y, pixel_val);
      }
    }
  }
  // Relatively efficient downsampling of an image by a factor of two with a
  // low-pass 5x5 smoothing operation thrown in.
  void downsampleSmoothed5x5(const Image<T>& original) {
    const int32 max_x = original.width_less_one_;
    const int32 max_y = original.height_less_one_;

    // The JY Bouguet paper on Lucas-Kanade recommends a
    // [1/16 1/4 3/8 1/4 1/16]^2 filter.
    // This works out to a [1 4 6 4 1]^2 / 256 array, precomputed below.
    static const int32 window_radius = 2;
    static const int32 window_size = window_radius * 2 + 1;
    static const int32 window_weights[] = { 1,  4,  6,  4,  1,   //  16 +
                                            4, 16, 24, 16,  4,   //  64 +
                                            6, 24, 36, 24,  6,   //  96 +
                                            4, 16, 24, 16,  4,   //  64 +
                                            1,  4,  6,  4,  1};  //  16 = 256

    // We'll multiply and sum with the whole numbers first, then divide by
    // the total weight to normalize at the last moment.
    for (int32 y = 0; y < height_; ++y) {
      for (int32 x = 0; x < width_; ++x) {
        int32 pixel_sum = 0;

        const int32* w = window_weights;
        const int32 start_x = clip((x << 1) - window_radius, ZERO, max_x);

        // Clip the boundaries to the size of the image.
        for (int32 window_y = 0; window_y < window_size; ++window_y) {
          const int32 start_y =
              clip((y << 1) - window_radius + window_y, ZERO, max_y);

          const T* p = original.getPixelPtrConst(start_x, start_y);

          for (int32 window_x = 0; window_x < window_size; ++window_x) {
            pixel_sum += *p++ * *w++;
          }
        }

        // Conversion to type T will happen here after shifting right 8 bits to
        // divide by 256.
        setPixel(x, y, pixel_sum >> 8);
      }
    }
  }
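
  // The weight table above is the outer product of [1 4 6 4 1] with itself;
  // a minimal sketch of that precomputation (hypothetical names, not part of
  // the original header):
  //
  //   int32 weights[25];
  //   static const int32 kKernel1D[] = {1, 4, 6, 4, 1};
  //   for (int32 i = 0; i < 5; ++i) {
  //     for (int32 j = 0; j < 5; ++j) {
  //       weights[i * 5 + j] = kKernel1D[i] * kKernel1D[j];  // Sums to 256.
  //     }
  //   }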
  // Optimized Scharr filter on a single pixel in the X direction.
  // Scharr filters are like central-difference operators, but have more
  // rotational symmetry in their response because they also consider the
  // diagonal neighbors.
  template <typename U>
  inline T scharrPixelX(const Image<U>& original,
                        const int32 center_x, const int32 center_y) const {
    const int32 min_x = clip(center_x - 1, ZERO, original.width_less_one_);
    const int32 max_x = clip(center_x + 1, ZERO, original.width_less_one_);
    const int32 min_y = clip(center_y - 1, ZERO, original.height_less_one_);
    const int32 max_y = clip(center_y + 1, ZERO, original.height_less_one_);

    // Convolution loop unrolled for performance...
    return (3 * (original.getPixel(max_x, min_y)
                 + original.getPixel(max_x, max_y)
                 - original.getPixel(min_x, min_y)
                 - original.getPixel(min_x, max_y))
            + 10 * (original.getPixel(max_x, center_y)
                    - original.getPixel(min_x, center_y))) / 32;
  }
  // Optimized Scharr filter on a single pixel in the Y direction.
  // Scharr filters are like central-difference operators, but have more
  // rotational symmetry in their response because they also consider the
  // diagonal neighbors.
  template <typename U>
  inline T scharrPixelY(const Image<U>& original,
                        const int32 center_x, const int32 center_y) const {
    const int32 min_x = clip(center_x - 1, ZERO, original.width_less_one_);
    const int32 max_x = clip(center_x + 1, ZERO, original.width_less_one_);
    const int32 min_y = clip(center_y - 1, ZERO, original.height_less_one_);
    const int32 max_y = clip(center_y + 1, ZERO, original.height_less_one_);

    // Convolution loop unrolled for performance...
    return (3 * (original.getPixel(min_x, max_y)
                 + original.getPixel(max_x, max_y)
                 - original.getPixel(min_x, min_y)
                 - original.getPixel(max_x, min_y))
            + 10 * (original.getPixel(center_x, max_y)
                    - original.getPixel(center_x, min_y))) / 32;
  }
  // Convolve the image with a Scharr filter in the X direction.
  // Much faster than an equivalent generic convolution.
  template <typename U>
  inline void scharrX(const Image<U>& original) {
    for (int32 y = 0; y < height_; ++y) {
      for (int32 x = 0; x < width_; ++x) {
        setPixel(x, y, scharrPixelX(original, x, y));
      }
    }
  }

  // Convolve the image with a Scharr filter in the Y direction.
  // Much faster than an equivalent generic convolution.
  template <typename U>
  inline void scharrY(const Image<U>& original) {
    for (int32 y = 0; y < height_; ++y) {
      for (int32 x = 0; x < width_; ++x) {
        setPixel(x, y, scharrPixelY(original, x, y));
      }
    }
  }
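
  // For reference, the unrolled Scharr loops above correspond to these 3x3
  // kernels (each normalized by 1/32):
  //
  //   X:  [ -3   0   3 ]        Y:  [ -3  -10  -3 ]
  //       [-10   0  10 ]            [  0    0   0 ]
  //       [ -3   0   3 ]            [  3   10   3 ]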
  // Half the difference between two values; used below to compute central
  // differences, which estimate the derivative at the midpoint.
  static inline T halfDiff(int32 first, int32 second) {
    return (second - first) / 2;
  }

  template <typename U>
  void derivativeX(const Image<U>& original) {
    for (int32 y = 0; y < height_; ++y) {
      T* const dest_row = getPixelPtr(0, y);
      const U* const source_row = original.getPixelPtrConst(0, y);

      // Compute first pixel.
      dest_row[0] = halfDiff(source_row[0], source_row[1]);

      // Last pixel.
      dest_row[width_less_one_] = halfDiff(source_row[width_less_one_ - 1],
                                           source_row[width_less_one_]);

      // All the pixels in between.
      const U* const source_prev_pixel = source_row - 1;
      const U* const source_next_pixel = source_row + 1;
      for (int32 x = 1; x < width_less_one_; ++x) {
        dest_row[x] = halfDiff(source_prev_pixel[x], source_next_pixel[x]);
      }
    }
  }

  template <typename U>
  void derivativeY(const Image<U>& original) {
    for (int32 y = 0; y < height_; ++y) {
      T* const dest_row = getPixelPtr(0, y);

      const U* const source_prev_pixel =
          original.getPixelPtrConst(0, max(0, y - 1));
      const U* const source_next_pixel =
          original.getPixelPtrConst(0, min(height_less_one_, y + 1));

      for (int32 x = 0; x < width_; ++x) {
        dest_row[x] = halfDiff(source_prev_pixel[x], source_next_pixel[x]);
      }
    }
  }
  // Generic function for convolving pixel with 3x3 filter.
  // Filter pixels should be in row major order.
  template <typename U>
  inline T convolvePixel3x3(const Image<U>& original,
                            const int32* const filter,
                            const int32 center_x, const int32 center_y,
                            const int32 total) const {
    int32 sum = 0;
    for (int32 filter_y = 0; filter_y < 3; ++filter_y) {
      // Clip to the last valid row/column (inclusive bounds).
      const int32 y =
          clip(center_y - 1 + filter_y, 0, original.getHeight() - 1);
      for (int32 filter_x = 0; filter_x < 3; ++filter_x) {
        const int32 x =
            clip(center_x - 1 + filter_x, 0, original.getWidth() - 1);
        sum += original.getPixel(x, y) * filter[filter_y * 3 + filter_x];
      }
    }
    return sum / total;
  }
  // Generic function for convolving an image with a 3x3 filter.
  // TODO(andrewharp): Generalize this for any size filter.
  template <typename U>
  inline void convolve3x3(const Image<U>& original,
                          const int32* const filter) {
    // Normalize by the sum of the absolute filter weights.
    int32 sum = 0;
    for (int32 i = 0; i < 9; ++i) {
      sum += abs(filter[i]);
    }

    for (int32 y = 0; y < height_; ++y) {
      for (int32 x = 0; x < width_; ++x) {
        setPixel(x, y, convolvePixel3x3(original, filter, x, y, sum));
      }
    }
  }
  // Load this image's data from a data array. The data at pixels is assumed to
  // have dimensions equivalent to this image's dimensions * factor.
  inline void fromArray(const T* const pixels, const int32 stride,
                        const int32 factor) {
    if (factor == 1) {
      // If not subsampling, a straight memcpy should be faster.
      memcpy(this->image_data_, pixels, num_pixels_ * sizeof(T));
      return;
    }

    downsampleAveraged(pixels, stride, factor);
  }

  // Copy the image back out to an appropriately sized data array.
  inline void toArray(T* const pixels) const {
    // Not subsampling, so memcpy should be faster.
    memcpy(pixels, this->image_data_, num_pixels_ * sizeof(T));
  }
 private:
  inline void allocate() {
    image_data_ = (T*) malloc(num_pixels_ * sizeof(T));
    if (image_data_ == NULL) {
      LOGE("Couldn't allocate image data!");
    }
  }

  T* image_data_;

  int32 width_;
  int32 height_;

 public:
  // Precompute these for efficiency's sake as they're used by a lot of
  // clipping code and loop code.
  // TODO(andrewharp): make these only accessible by other Images.
  int32 width_less_one_;
  int32 height_less_one_;
  int32 num_pixels_;
};
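
// Usage sketch (illustrative only; luminance_data is a hypothetical
// caller-supplied uint8 buffer of size w * h):
//
//   const int32 w = 320, h = 240;
//   Image<uint8> frame(w, h);
//   frame.fromArray(luminance_data, w, 1);  // Factor 1: straight copy.
//   Image<uint8> half(w / 2, h / 2);
//   half.downsampleSmoothed3x3(frame);      // Half size, low-pass filtered.
//   Image<int32> dx(w / 2, h / 2);
//   dx.derivativeX(half);                   // Spatial X derivative.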
// Create a pyramid of downsampled images. The first level of the pyramid is
// the original image.
inline void computeSmoothedPyramid(const Image<uint8>& frame,
                                   const int32 num_levels,
                                   Image<uint8>** const pyramid) {
  // TODO(andrewharp): Find a const correct solution to this...
  // Maybe make a pyramid class with the first level of this pyramid as a
  // separate pointer?
  // Cast away const, but we're not going to hurt it, honest!
  pyramid[0] = const_cast<Image<uint8>*>(&frame);

  for (int32 l = 1; l < num_levels; ++l) {
    pyramid[l]->downsampleSmoothed3x3(*pyramid[l - 1]);
  }
}
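
// Allocation sketch for the pyramid argument (illustrative; kNumLevels is
// hypothetical). Level 0 aliases the input frame, so only levels 1 and up
// are owned by the caller:
//
//   Image<uint8>* pyramid[kNumLevels];
//   for (int32 l = 1; l < kNumLevels; ++l) {
//     pyramid[l] = new Image<uint8>(width >> l, height >> l);
//   }
//   computeSmoothedPyramid(frame, kNumLevels, pyramid);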
// Create a spatial derivative pyramid based on a downsampled pyramid.
inline void computeSpatialPyramid(const Image<uint8>** const pyramid,
                                  const int32 num_levels,
                                  Image<int32>** pyramid_x,
                                  Image<int32>** pyramid_y) {
  for (int32 l = 0; l < num_levels; ++l) {
    const Image<uint8>& frame = *pyramid[l];

    // Fast convolutions to find spatial derivatives.
    pyramid_x[l]->derivativeX(frame);
    pyramid_y[l]->derivativeY(frame);
  }
}
// Marks a circle of a given radius on the boolean image.
// If the center spot is already marked, don't do anything and return false.
// Otherwise, mark everything in range true and return true.
template <typename U>
inline static bool markImage(const int32 x, const int32 y,
                             const int32 radius,
                             Image<U>* const img) {
  if (img->getPixel(x, y)) {
    // Already claimed, sorry.
    return false;
  }

  const int32 squared_radius = square(radius);

  for (int32 d_y = 0; d_y < radius; ++d_y) {
    const int32 squared_y_dist = square(d_y);

    const int32 min_y = y > d_y ? y - d_y : ZERO;
    const int32 max_y = min(y + d_y, img->height_less_one_);

    for (int32 d_x = 0; d_x < radius; ++d_x) {
      if (squared_y_dist + square(d_x) <= squared_radius) {
        const int32 min_x = x > d_x ? x - d_x : ZERO;
        const int32 max_x = min(x + d_x, img->width_less_one_);

        // Mark all four quadrants.
        img->setPixel(max_x, max_y, true);
        img->setPixel(max_x, min_y, true);
        img->setPixel(min_x, max_y, true);
        img->setPixel(min_x, min_y, true);
      } else {
        // Once we're too far out, we're not coming back in.
        break;
      }
    }
  }
  return true;
}
// Puts the image gradient matrix about a pixel into the 2x2 float array G.
// vals_x should be an array of the window x gradient values, whose indices
// can be in any order but are parallel to the vals_y entries.
// Assumes G has been zero-initialized by the caller.
// See http://robots.stanford.edu/cs223b04/algo_tracking.pdf for more details.
inline void calculateG(const float32* const vals_x, const float32* const vals_y,
                       const int32 num_vals, float* const G) {
  // Defined here because we want to keep track of how many values were
  // processed by NEON, so that we can finish off the remainder the normal
  // way.
  int32 i = 0;

#ifdef HAVE_ARMEABI_V7A
  if (supportsNeon()) {
    const float32_t* const arm_vals_x = (const float32_t*) vals_x;
    const float32_t* const arm_vals_y = (const float32_t*) vals_y;

    // Running sums.
    float32x4_t xx = vdupq_n_f32(0.0f);
    float32x4_t xy = vdupq_n_f32(0.0f);
    float32x4_t yy = vdupq_n_f32(0.0f);

    // Maximum index we can load 4 consecutive values from.
    // e.g. if there are 81 values, our last full pass can be from index 77:
    // 81 - 4 => 77 (77, 78, 79, 80)
    const int32 max_i = num_vals - 4;

    // Process values 4 at a time, accumulating the sums of
    // the pixel-wise x*x, x*y, and y*y values.
    for (; i <= max_i; i += 4) {
      // Load xs.
      float32x4_t x = vld1q_f32(arm_vals_x + i);

      // Multiply x*x and accumulate.
      xx = vmlaq_f32(xx, x, x);

      // Load ys.
      float32x4_t y = vld1q_f32(arm_vals_y + i);

      // Multiply x*y and accumulate.
      xy = vmlaq_f32(xy, x, y);

      // Multiply y*y and accumulate.
      yy = vmlaq_f32(yy, y, y);
    }

    // Note: static buffers make this function non-reentrant/non-thread-safe.
    static float32_t xx_vals[4];
    static float32_t xy_vals[4];
    static float32_t yy_vals[4];

    vst1q_f32(xx_vals, xx);
    vst1q_f32(xy_vals, xy);
    vst1q_f32(yy_vals, yy);

    // Accumulated values are stored in sets of 4; we have to manually add
    // the last bits together.
    for (int32 j = 0; j < 4; ++j) {
      G[0] += xx_vals[j];
      G[1] += xy_vals[j];
      G[3] += yy_vals[j];
    }
  }
#endif

  // Non-accelerated version, which also finishes off the last few values
  // (< 4) from above.
  for (; i < num_vals; ++i) {
    G[0] += square(vals_x[i]);
    G[1] += vals_x[i] * vals_y[i];
    G[3] += square(vals_y[i]);
  }

  // The matrix is symmetric, so this is a given.
  G[2] = G[1];
}
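
// In matrix form, the G computed above is the 2x2 second-moment (gradient
// covariance) matrix from the Lucas-Kanade method, stored row-major:
//
//   G = | sum(x_i * x_i)   sum(x_i * y_i) |
//       | sum(x_i * y_i)   sum(y_i * y_i) |
//
// with G[2] copied from G[1] by symmetry.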
// Puts the image gradient matrix about a pixel into the 2x2 float array G.
// Looks up interpolated pixels, then calls the array-based method above for
// the actual computation.
inline void calculateG(const int window_size,
                       const float32 center_x, const float32 center_y,
                       const Image<int32>& I_x, const Image<int32>& I_y,
                       float* const G) {
  CHECK(I_x.validPixel(center_x, center_y), "Problem in calculateG!");

  // Hardcoded to allow for a max window radius of 5 (11 pixels x 11 pixels).
  static const int kMaxWindowRadius = 5;
  CHECK(window_size <= kMaxWindowRadius,
        "Window %d > %d!", window_size, kMaxWindowRadius);

  // Diameter of window is 2 * radius + 1 for center pixel.
  static const int kWindowBufferSize =
      (kMaxWindowRadius * 2 + 1) * (kMaxWindowRadius * 2 + 1);

  // Preallocate buffers statically for efficiency (note: this makes the
  // function non-thread-safe).
  static float32 vals_x[kWindowBufferSize];
  static float32 vals_y[kWindowBufferSize];

  int32 num_vals = 0;

  for (int32 win_x = -window_size; win_x <= window_size; ++win_x) {
    for (int32 win_y = -window_size; win_y <= window_size; ++win_y) {
      vals_x[num_vals] = I_x.getPixelInterp(center_x + win_x,
                                            center_y + win_y);
      vals_y[num_vals] = I_y.getPixelInterp(center_x + win_x,
                                            center_y + win_y);
      ++num_vals;
    }
  }

  calculateG(vals_x, vals_y, num_vals, G);
}
}  // namespace flow

#endif  // JAVA_COM_GOOGLE_ANDROID_APPS_UNVEIL_JNI_OPTICALFLOW_IMAGE_H_