/ocr/ocrservice/jni/opticalflow/image.h
C++ Header | 779 lines | 474 code | 132 blank | 173 comment | 51 complexity | 30b321ae7bed2aab764ce116348a8428 MD5 | raw file
// Copyright 2009 Google Inc. All Rights Reserved.
// Author: andrewharp@google.com (Andrew Harp)

#ifndef JAVA_COM_GOOGLE_ANDROID_APPS_UNVEIL_JNI_OPTICALFLOW_IMAGE_H_
#define JAVA_COM_GOOGLE_ANDROID_APPS_UNVEIL_JNI_OPTICALFLOW_IMAGE_H_

#include "optical_flow_utils.h"

// TODO(andrewharp): Make this a cast to uint32 if/when we go unsigned for
// operations.
#define ZERO 0

#ifdef SANITY_CHECKS
  // Aborts (via CHECK) when (X, Y) is not a valid integer pixel coordinate
  // for IMAGE. Compiled out entirely unless SANITY_CHECKS is defined.
  #define CHECK_PIXEL(IMAGE, X, Y) {\
    CHECK((IMAGE)->validPixel((X), (Y)), \
          "CHECK_PIXEL(%d,%d) in %dx%d image.", \
          static_cast<int32>(X), static_cast<int32>(Y), \
          (IMAGE)->getWidth(), (IMAGE)->getHeight());\
  }

  // Aborts when (X, Y) is not safe for bilinear interpolation in IMAGE
  // (interpolation needs the pixel to the right and below, so the valid
  // range is tighter than CHECK_PIXEL's).
  #define CHECK_PIXEL_INTERP(IMAGE, X, Y) {\
    CHECK((IMAGE)->validInterpPixel((X), (Y)), \
          "CHECK_PIXEL_INTERP(%.2f, %.2f) in %dx%d image.", \
          static_cast<float32>(X), static_cast<float32>(Y), \
          (IMAGE)->getWidth(), (IMAGE)->getHeight());\
  }
#else
  // No-op versions for regular builds.
  #define CHECK_PIXEL(image, x, y) {}
  #define CHECK_PIXEL_INTERP(IMAGE, X, Y) {}
#endif

namespace flow {

// TODO(andrewharp): Make explicit which operations support negative numbers or
// struct/class types in image data (possibly create fast multi-dim array class
// for data where pixel arithmetic does not make sense).

// Image class optimized for working on numeric arrays as grayscale image data.
// Supports other data types as a 2D array class, so long as no pixel math
// operations are called (convolution, downsampling, etc).
template <typename T>
class Image {
 public:
  // Allocates an uninitialized width x height pixel buffer (row major).
  Image(const int32 width, const int32 height) :
      width_(width),
      height_(height),
      width_less_one_(width_ - 1),
      height_less_one_(height_ - 1),
      num_pixels_(width_ * height_) {
    allocate();
  }

  // Same as above, taking dimensions from a Size struct.
  explicit Image(const Size& size) :
      width_(size.width),
      height_(size.height),
      width_less_one_(width_ - 1),
      height_less_one_(height_ - 1),
      num_pixels_(width_ * height_) {
    allocate();
  }

  // Constructor that creates an image from preallocated data.
  // Note: The image takes ownership of the data.
  // NOTE(review): the destructor releases the buffer with free(), so the
  // data passed here must have been malloc-allocated — confirm at call sites.
  Image(const int32 width, const int32 height, T* const image) :
      width_(width),
      height_(height),
      width_less_one_(width_ - 1),
      height_less_one_(height_ - 1),
      num_pixels_(width_ * height_) {
    image_data_ = image;
    if (image_data_ == NULL) {
      LOGE("Can't create image with NULL data!");
    }
  }

  ~Image() {
    free(image_data_);
  }

  inline int32 getWidth() const { return width_; }
  inline int32 getHeight() const { return height_; }

  // Bilinearly sample a value between pixels.
  // Values outside of the image are sampled from the nearest edge of the image.
  inline float32 getPixelInterp(const float32 x, const float32 y) const {
    // Do int32 conversion one time.
    const int32 floored_x = (int32) x;
    const int32 floored_y = (int32) y;

    // Note: it might be the case that the *_[min|max] values are clipped, and
    // these (the a b c d vals) aren't (for speed purposes), but that doesn't
    // matter. We'll just be blending the pixel with itself in that case anyway.
    const float32 b = x - floored_x;  // Horizontal fraction.
    const float32 a = 1.0f - b;

    const float32 d = y - floored_y;  // Vertical fraction.
    const float32 c = 1.0f - d;

    CHECK(validInterpPixel(x, y),
          "x or y out of bounds! %.2f [0 - %d), %.2f [0 - %d)",
          x, width_less_one_, y, height_less_one_);

    const T* const pix_ptr = getPixelPtrConst(floored_x, floored_y);

// Experimental NEON acceleration... not to be turned on until it's faster.
#if FALSE
#ifdef HAVE_ARMEABI_V7A
    if (supportsNeon()) {
      // Output value:
      //   a * c * p1 +
      //   b * c * p2 +
      //   a * d * p3 +
      //   b * d * p4
      const float32x2_t ab = {a, b};
      const float32x4_t ab_c_ab_d = vcombine_f32(vmul_n_f32(ab, c),
                                                 vmul_n_f32(ab, d));

      const float32x4_t p1p2p3p4 = {pix_ptr[0], pix_ptr[1],
                                    pix_ptr[width_], pix_ptr[width_ + 1]};

      float32x4_t almost = vmulq_f32(ab_c_ab_d, p1p2p3p4);

      // Butterfly-ish sum.
      almost = vaddq_f32(vrev64q_f32(almost), almost);

      return vgetq_lane_f32(almost, 0) + vgetq_lane_f32(almost, 1);
    }
#endif
#endif

    // Get the pixel values surrounding this point.
    const T& p1 = pix_ptr[0];
    const T& p2 = pix_ptr[1];
    const T& p3 = pix_ptr[width_];
    const T& p4 = pix_ptr[width_ + 1];

    // Simple bilinear interpolation between four reference pixels.
    // If x is the value requested:
    //       a  b
    //      -------
    //     c |p1 p2|
    //       |  x  |
    //     d |p3 p4|
    //      -------
    return c * ((a * p1) + (b * p2)) +
           d * ((a * p3) + (b * p4));
  }

  // Returns true iff the pixel is in the image's boundaries.
  inline bool validPixel(const int32 x, const int32 y) const {
    return inRange(x, ZERO, width_less_one_) &&
           inRange(y, ZERO, height_less_one_);
  }

  // Returns true iff the pixel is in the image's boundaries for interpolation
  // purposes.
  // TODO(andrewharp): check in interpolation follow-up change.
  inline bool validInterpPixel(const float32 x, const float32 y) const {
    // Exclusive of max because we can be more efficient if we don't handle
    // interpolating on or past the last pixel.
    return (x >= ZERO) && (x < width_less_one_) &&
           (y >= ZERO) && (y < height_less_one_);
  }

  // Safe lookup with boundary enforcement (coordinates are clamped to the
  // nearest edge).
  inline T getPixelClipped(const int32 x, const int32 y) const {
    return getPixel(clip(x, ZERO, width_less_one_),
                    clip(y, ZERO, height_less_one_));
  }

  // Returns a const pointer to the pixel in question.
  inline const T* getPixelPtrConst(const int32 x, const int32 y) const {
    CHECK_PIXEL(this, x, y);
    return image_data_ + y * width_ + x;
  }

  // Returns a pointer to the pixel in question.
  inline T* getPixelPtr(const int32 x, const int32 y) const {
    CHECK_PIXEL(this, x, y);
    return image_data_ + y * width_ + x;
  }

  // Fast lookup without boundary enforcement.
  inline const T getPixel(const int32 x, const int32 y) const {
    CHECK_PIXEL(this, x, y);
    return image_data_[y * width_ + x];
  }

  // Fast setting without boundary enforcement.
  inline void setPixel(const int32 x, const int32 y, const T& val) {
    CHECK_PIXEL(this, x, y);
    image_data_[y * width_ + x] = val;
  }

  // Clears image to a single value.
  // NOTE(review): memset fills byte-by-byte, so when sizeof(T) > 1 this is
  // only correct for values whose byte pattern repeats (e.g. 0) — confirm
  // callers only pass such values.
  inline void clear(const T& val) {
    memset(image_data_, val, sizeof(*image_data_) * num_pixels_);
  }


#ifdef HAVE_ARMEABI_V7A
  // This function does the bulk of the work.
  // Averages one 32-pixel-wide column strip of the 4x-larger original down
  // into an 8-pixel-wide strip of this image, for every output row.
  inline void downsample32ColumnsNeon(const uint8* const original,
                                      const int32 stride,
                                      const int32 orig_x) {
    // Divide input x offset by 4 to find output offset.
    const int32 new_x = orig_x >> 2;

    // Initial offset into top row.
    const uint8* offset = original + orig_x;

    // Sum along vertical columns.
    // Process 32x4 input pixels and 8x1 output pixels per iteration.
    for (int32 new_y = 0; new_y < height_; ++new_y) {
      uint16x8_t accum1 = vdupq_n_u16(0);
      uint16x8_t accum2 = vdupq_n_u16(0);

      // Go top to bottom across the four rows of input pixels that make up
      // this output row.
      for (int32 row_num = 0; row_num < 4; ++row_num) {
        // First 16 bytes.
        {
          // Load 16 bytes of data from current offset.
          const uint8x16_t curr_data1 = vld1q_u8(offset);
          // Pairwise add and accumulate into accum vectors (16 bit to account
          // for values above 255).
          accum1 = vpadalq_u8(accum1, curr_data1);
        }

        // Second 16 bytes.
        {
          // Load the next 16 bytes of data from current offset.
          const uint8x16_t curr_data2 = vld1q_u8(offset + 16);
          // Pairwise add and accumulate into accum vectors (16 bit to account
          // for values above 255).
          accum2 = vpadalq_u8(accum2, curr_data2);
        }

        // Move offset down one row.
        offset += stride;
      }

      // Add and widen, then divide by 16 (number of input pixels per output
      // pixel) and narrow data from 32 bits per pixel to 16 bpp.
      const uint16x4_t tmp_pix1 = vqshrn_n_u32(vpaddlq_u16(accum1), 4);
      const uint16x4_t tmp_pix2 = vqshrn_n_u32(vpaddlq_u16(accum2), 4);

      // Combine 4x1 pixel strips into 8x1 pixel strip and narrow from
      // 16 bits to 8 bits per pixel.
      const uint8x8_t allpixels = vmovn_u16(vcombine_u16(tmp_pix1, tmp_pix2));

      // This points to the leftmost pixel of our 8 horizontally arranged
      // pixels in the destination data.
      uint8* const ptr_dst = getPixelPtr(new_x, new_y);

      // Copy all pixels from composite 8x1 vector into output strip.
      vst1_u8(ptr_dst, allpixels);
    }
  }


  // Hardware accelerated downsampling method for supported devices.
  // Requires that image size be a multiple of 16 pixels in each dimension,
  // and that downsampling be by a factor of 4.
  void downsampleAveragedNeon(const uint8* const original,
                              const int32 stride) {
    // Hardcoded to only work on 4x downsampling.
    const int32 orig_width = width_ * 4;

    // We process 32 input pixels lengthwise at a time.
    // The output per pass of this loop is an 8 wide by 1 tall pixel strip.
    for (int32 orig_x = 0; orig_x < orig_width; orig_x += 32) {
      // Push it to the left enough so that it never goes out of bounds.
      // This will result in some extra computation on the last pass on
      // devices whose frame widths are not multiples of 32.
      downsample32ColumnsNeon(original, stride, min(orig_x, orig_width - 32));
    }
  }
#endif


  // Naive downsampler that reduces image size by factor by averaging pixels in
  // blocks of size factor x factor.
  void downsampleAveraged(const T* const original, const int32 stride,
                          const int32 factor) {
#ifdef HAVE_ARMEABI_V7A
    // NOTE(review): this gate only checks height_ % 4, while the NEON path's
    // comment claims a multiple-of-16 requirement in each dimension — confirm
    // width preconditions are guaranteed by callers.
    if (supportsNeon() &&
        factor == 4 &&
        (height_ % 4) == 0) {
      downsampleAveragedNeon(original, stride);
      return;
    }
#endif

    const int32 pixels_per_block = factor * factor;

    // For every pixel in resulting image.
    for (int32 y = 0; y < height_; ++y) {
      const int32 orig_y = y * factor;
      const int32 y_bound = orig_y + factor;

      // Sum up the original pixels.
      for (int32 x = 0; x < width_; ++x) {
        const int32 orig_x = x * factor;
        const int32 x_bound = orig_x + factor;

        // Making this int32 because type U or T might overflow.
        int32 pixel_sum = 0;

        // Grab all the pixels that make up this pixel.
        for (int32 curr_y = orig_y; curr_y < y_bound; ++curr_y) {
          const T* p = original + curr_y * stride + orig_x;

          for (int32 curr_x = orig_x; curr_x < x_bound; ++curr_x) {
            pixel_sum += *p++;
          }
        }

        setPixel(x, y, pixel_sum / pixels_per_block);
      }
    }
  }

  // Naive downsampler that reduces image size by factor by averaging pixels in
  // blocks of size factor x factor.
326 void downsampleAveraged(const Image<T>& original, const int32 factor) { 327 downsampleAveraged(original.getPixelPtr(0, 0), original.getWidth(), factor); 328 } 329 330 // Relatively efficient downsampling of an image by a factor of two with a 331 // low-pass 3x3 smoothing operation thrown in. 332 void downsampleSmoothed3x3(const Image<T>& original) { 333 for (int32 y = 0; y < height_; ++y) { 334 const int32 orig_y = clip(2 * y, ZERO, original.height_less_one_); 335 const int32 min_y = clip(orig_y - 1, ZERO, original.height_less_one_); 336 const int32 max_y = clip(orig_y + 1, ZERO, original.height_less_one_); 337 338 for (int32 x = 0; x < width_; ++x) { 339 const int32 orig_x = clip(2 * x, ZERO, original.width_less_one_); 340 const int32 min_x = clip(orig_x - 1, ZERO, original.width_less_one_); 341 const int32 max_x = clip(orig_x + 1, ZERO, original.width_less_one_); 342 343 // Center. 344 int32 pixel_sum = original.getPixel(orig_x, orig_y) * 4; 345 346 // Sides. 347 pixel_sum += (original.getPixel(max_x, orig_y) + 348 original.getPixel(min_x, orig_y) + 349 original.getPixel(orig_x, max_y) + 350 original.getPixel(orig_x, min_y)) * 2; 351 352 // Diagonals. 353 pixel_sum += (original.getPixel(max_x, max_y) + 354 original.getPixel(min_x, max_y) + 355 original.getPixel(max_x, min_y) + 356 original.getPixel(min_x, min_y)); 357 358 const int32 pixel_val = pixel_sum>>4; // 16 359 360 //LOGV("Setting %d,%d to %d", col, row, pixel_val); 361 362 setPixel(x, y, pixel_val); 363 } 364 } 365 } 366 367 // Relatively efficient downsampling of an image by a factor of two with a 368 // low-pass 5x5 smoothing operation thrown in. 369 void downsampleSmoothed5x5(const Image<T>& original) { 370 const int32 max_x = original.width_less_one_; 371 const int32 max_y = original.height_less_one_; 372 373 // The JY Bouget paper on Lucas-Kanade recommends a 374 // [1/16 1/4 3/8 1/4 1/16]^2 filter. 375 // This works out to a [1 4 6 4 1]^2 / 256 array, precomputed below. 
376 static const int32 window_radius = 2; 377 static const int32 window_size = window_radius*2 + 1; 378 static const int32 window_weights[] = {1, 4, 6, 4,1, // 16 + 379 4,16,24,16,4, // 64 + 380 6,24,36,24,6, // 96 + 381 4,16,24,16,4, // 64 + 382 1, 4, 6, 4,1}; // 16 = 256 383 384 // We'll multiply and sum with the the whole numbers first, then divide by 385 // the total weight to normalize at the last moment. 386 for (int32 y = 0; y < height_; ++y) { 387 for (int32 x = 0; x < width_; ++x) { 388 int32 pixel_sum = 0; 389 390 const int32* w = window_weights; 391 const int32 start_x = clip((x<<1) - window_radius, ZERO, max_x); 392 393 // Clip the boundaries to the size of the image. 394 for (int32 window_y = 0; window_y < window_size; ++window_y) { 395 const int32 start_y = 396 clip((y<<1) - window_radius + window_y, ZERO, max_y); 397 398 const T* p = original.getPixelPtrConst(start_x, start_y); 399 400 for (int32 window_x = 0; window_x < window_size; ++window_x) { 401 pixel_sum += *p++ * *w++; 402 } 403 } 404 405 // Conversion to type T will happen here after shifting right 8 bits to 406 // divide by 256. 407 setPixel(x, y, pixel_sum >> 8); 408 } 409 } 410 } 411 412 // Optimized Scharr filter on a single pixel in the X direction. 413 // Scharr filters are like central-difference operators, but have more 414 // rotational symmetry in their response because they also consider the 415 // diagonal neighbors. 416 template <typename U> 417 inline T scharrPixelX(const Image<U>& original, 418 const int32 center_x, const int32 center_y) const { 419 const int32 min_x = clip(center_x - 1, ZERO, original.width_less_one_); 420 const int32 max_x = clip(center_x + 1, ZERO, original.width_less_one_); 421 const int32 min_y = clip(center_y - 1, ZERO, original.height_less_one_); 422 const int32 max_y = clip(center_y + 1, ZERO, original.height_less_one_); 423 424 // Convolution loop unrolled for performance... 
425 return (3 * (original.getPixel(max_x, min_y) 426 + original.getPixel(max_x, max_y) 427 - original.getPixel(min_x, min_y) 428 - original.getPixel(min_x, min_y)) 429 + 10 * (original.getPixel(max_x, center_y) 430 - original.getPixel(min_x, center_y))) / 32; 431 } 432 433 // Optimized Scharr filter on a single pixel in the X direction. 434 // Scharr filters are like central-difference operators, but have more 435 // rotational symmetry in their response because they also consider the 436 // diagonal neighbors. 437 template <typename U> 438 inline T scharrPixelY(const Image<U>& original, 439 const int32 center_x, const int32 center_y) const { 440 const int32 min_x = clip(center_x - 1, 0, original.width_less_one_); 441 const int32 max_x = clip(center_x + 1, 0, original.width_less_one_); 442 const int32 min_y = clip(center_y - 1, 0, original.height_less_one_); 443 const int32 max_y = clip(center_y + 1, 0, original.height_less_one_); 444 445 // Convolution loop unrolled for performance... 446 return (3 * (original.getPixel(min_x, max_y) 447 + original.getPixel(max_x, max_y) 448 - original.getPixel(min_x, min_y) 449 - original.getPixel(max_x, min_y)) 450 + 10 * (original.getPixel(center_x, max_y) 451 - original.getPixel(center_x, min_y))) / 32; 452 } 453 454 // Convolve the image with a Scharr filter in the X direction. 455 // Much faster than an equivalent generic convolution. 456 template <typename U> 457 inline void scharrX(const Image<U>& original) { 458 for (int32 y = 0; y < height_; ++y) { 459 for (int32 x = 0; x < width_; ++x) { 460 setPixel(x, y, scharrPixelX(original, x, y)); 461 } 462 } 463 } 464 465 // Convolve the image with a Scharr filter in the Y direction. 466 // Much faster than an equivalent generic convolution. 
467 template <typename U> 468 inline void scharrY(const Image<U>& original) { 469 for (int32 y = 0; y < height_; ++y) { 470 for (int32 x = 0; x < width_; ++x) { 471 setPixel(x, y, scharrPixelY(original, x, y)); 472 } 473 } 474 } 475 476 static inline T halfDiff(int32 first, int32 second) { 477 return (second - first) / 2; 478 } 479 480 template <typename U> 481 void derivativeX(const Image<U>& original) { 482 for (int32 y = 0; y < height_; ++y) { 483 T* const dest_row = getPixelPtr(0, y); 484 const U* const source_row = original.getPixelPtrConst(0, y); 485 486 // Compute first pixel. 487 dest_row[0] = halfDiff(source_row[0], source_row[1]); 488 489 // Last pixel. 490 dest_row[width_less_one_] = halfDiff(source_row[width_less_one_ - 1], 491 source_row[width_less_one_]); 492 493 // All the pixels in between. 494 const U* const source_prev_pixel = source_row - 1; 495 const U* const source_next_pixel = source_row + 1; 496 for (int32 x = 1; x < width_less_one_; ++x) { 497 dest_row[x] = halfDiff(source_prev_pixel[x], source_next_pixel[x]); 498 } 499 } 500 } 501 502 template <typename U> 503 void derivativeY(const Image<U>& original) { 504 for (int32 y = 0; y < height_; ++y) { 505 T* const dest_row = getPixelPtr(0, y); 506 507 const U* const source_prev_pixel = 508 original.getPixelPtrConst(0, max(0, y - 1)); 509 510 const U* const source_next_pixel = 511 original.getPixelPtrConst(0, min(height_less_one_, y + 1)); 512 513 for (int32 x = 0; x < width_; ++x) { 514 dest_row[x] = halfDiff(source_prev_pixel[x], source_next_pixel[x]); 515 } 516 } 517 } 518 519 // Generic function for convolving pixel with 3x3 filter. 520 // Filter pixels should be in row major order. 
521 template <typename U> 522 inline T convolvePixel3x3(const Image<U>& original, 523 const int32* const filter, 524 const int32 center_x, const int32 center_y, 525 const int32 total) const { 526 int32 sum = 0; 527 for (int32 filter_y = 0; filter_y < 3; ++filter_y) { 528 const int32 y = clip(center_y - 1 + filter_y, 0, original.getHeight()); 529 for (int32 filter_x = 0; filter_x < 3; ++filter_x) { 530 const int32 x = clip(center_x - 1 + filter_x, 0, original.getWidth()); 531 sum += original.getPixel(x, y) * filter[filter_y * 3 + filter_x]; 532 } 533 } 534 return sum / total; 535 } 536 537 // Generic function for convolving an image with a 3x3 filter. 538 // TODO(andrewharp): Generalize this for any size filter. 539 template <typename U> 540 inline void convolve3x3(const Image<U>& original, 541 const int32* const filter) { 542 int32 sum = 0; 543 for (int32 i = 0; i < 9; ++i) { 544 sum += abs(filter[i]); 545 } 546 for (int32 y = 0; y < height_; ++y) { 547 for (int32 x = 0; x < width_; ++x) { 548 setPixel(x, y, convolvePixel3x3(original, filter, x, y, sum)); 549 } 550 } 551 } 552 553 // Load this image's data from a data array. The data at pixels is assumed to 554 // have dimensions equivalent to this image's dimensions * factor. 555 inline void fromArray(const T* const pixels, const int32 stride, 556 const int32 factor) { 557 if (factor == 1) { 558 // If not subsampling, memcpy per line should be faster. 559 memcpy(this->image_data_, pixels, num_pixels_ * sizeof(T)); 560 return; 561 } 562 563 downsampleAveraged(pixels, stride, factor); 564 } 565 566 // Copy the image back out to an appropriately sized data array. 567 inline void toArray(T* const pixels) const { 568 // If not subsampling, memcpy should be faster. 
569 memcpy(pixels, this->image_data_, num_pixels_ * sizeof(T)); 570 } 571 572 private: 573 inline void allocate() { 574 image_data_ = (T*)malloc(num_pixels_ * sizeof(T)); 575 if (image_data_ == NULL) { 576 LOGE("Couldn't allocate image data!"); 577 } 578 } 579 580 T* image_data_; 581 int32 width_; 582 int32 height_; 583 584 public: 585 // Precompute these for efficiency's sake as they're used by a lot of 586 // clipping code and loop code. 587 // TODO(andrewharp): make these only accessible by other Images. 588 int32 width_less_one_; 589 int32 height_less_one_; 590 int32 num_pixels_; 591}; 592 593 594// Create a pyramid of downsampled images. The first level of the pyramid is the 595// original image. 596inline void computeSmoothedPyramid(const Image<uint8>& frame, 597 const int32 num_levels, 598 Image<uint8>** const pyramid) { 599 // TODO(andrewharp): Find a const correct solution to this... 600 // Maybe make an pyramid class with the first level of this pyramid as a 601 // separate pointer? 602 603 // Cast away const, but we're not going to hurt it, honest! 604 pyramid[0] = const_cast<Image<uint8>*>(&frame); 605 606 for (int32 l = 1; l < num_levels; ++l) { 607 pyramid[l]->downsampleSmoothed3x3(*pyramid[l - 1]); 608 } 609} 610 611 612// Create a spatial derivative pyramid based on a downsampled pyramid. 613inline void computeSpatialPyramid(const Image<uint8>** const pyramid, 614 const int32 num_levels, 615 Image<int32>** pyramid_x, 616 Image<int32>** pyramid_y) { 617 for (int32 l = 0; l < num_levels; ++l) { 618 const Image<uint8>& frame = *pyramid[l]; 619 620 // Fast convolutions to find spatial derivatives. 621 pyramid_x[l]->derivativeX(frame); 622 pyramid_y[l]->derivativeY(frame); 623 } 624} 625 626 627// Marks a circle of a given radius on the boolean image. 628// If the center spot is already marked, don't do anything and return false. 629// Otherwise, mark everything in range true and return true. 
630template <typename U> 631inline static bool markImage(const int32 x, const int32 y, 632 const int32 radius, 633 Image<U>* const img) { 634 if (img->getPixel(x, y)) { 635 // Already claimed, sorry. 636 return false; 637 } 638 639 const int32 squared_radius = square(radius); 640 641 for (int32 d_y = 0; d_y < radius; ++d_y) { 642 const int32 squared_y_dist = square(d_y); 643 644 const int32 min_y = y > d_y ? y - d_y : ZERO; 645 const int32 max_y = min(y + d_y, img->height_less_one_); 646 647 for (int32 d_x = 0; d_x < radius; ++d_x) { 648 if (squared_y_dist + square(d_x) <= squared_radius) { 649 const int32 min_x = x > d_x ? x - d_x : ZERO; 650 const int32 max_x = min(x + d_x, img->width_less_one_); 651 652 // Mark all four quadrants. 653 img->setPixel(max_x, max_y, true); 654 img->setPixel(max_x, min_y, true); 655 img->setPixel(min_x, max_y, true); 656 img->setPixel(min_x, min_y, true); 657 } else { 658 // Once we're too far out, we're not coming back in. 659 break; 660 } 661 } 662 } 663 return true; 664} 665 666 667// Puts the image gradient matrix about a pixel into the 2x2 float array G. 668// vals_x should be an array of the window x gradient values, whose indices 669// can be in any order but are parallel to the vals_y entries. 670// See http://robots.stanford.edu/cs223b04/algo_tracking.pdf for more details. 671inline void calculateG(const float32* const vals_x, const float32* const vals_y, 672 const int32 num_vals, float* const G) { 673 // Defined here because we want to keep track of how many values were 674 // processed by NEON, so that we can finish off the remainder the normal 675 // way. 676 int32 i = 0; 677 678#ifdef HAVE_ARMEABI_V7A 679 if (supportsNeon()) { 680 const float32_t* const arm_vals_x = (const float32_t*) vals_x; 681 const float32_t* const arm_vals_y = (const float32_t*) vals_y; 682 683 // Running sums. 
684 float32x4_t xx = vdupq_n_f32(0.0f); 685 float32x4_t xy = vdupq_n_f32(0.0f); 686 float32x4_t yy = vdupq_n_f32(0.0f); 687 688 // Maximum index we can load 4 consecutive values from. 689 // e.g. if there are 81 values, our last full pass can be from index 77: 690 // 81-4=>77 (77, 78, 79, 80) 691 const int32 max_i = num_vals - 4; 692 693 // Process values 4 at a time, accumulating the sums of 694 // the pixel-wise x*x, x*y, and y*y values. 695 for (; i <= max_i; i += 4) { 696 // Load xs 697 float32x4_t x = vld1q_f32(arm_vals_x + i); 698 699 // Multiply x*x and accumulate. 700 xx = vmlaq_f32(xx, x, x); 701 702 // Load ys 703 float32x4_t y = vld1q_f32(arm_vals_y + i); 704 705 // Multiply x*y and accumulate. 706 xy = vmlaq_f32(xy, x, y); 707 708 // Multiply y*y and accumulate. 709 yy = vmlaq_f32(yy, y, y); 710 } 711 712 static float32_t xx_vals[4]; 713 static float32_t xy_vals[4]; 714 static float32_t yy_vals[4]; 715 716 vst1q_f32(xx_vals, xx); 717 vst1q_f32(xy_vals, xy); 718 vst1q_f32(yy_vals, yy); 719 720 // Accumulated values are store in sets of 4, we have to manually add 721 // the last bits together. 722 for (int32 j = 0; j < 4; ++j) { 723 G[0] += xx_vals[j]; 724 G[1] += xy_vals[j]; 725 G[3] += yy_vals[j]; 726 } 727 } 728#endif 729 // Non-accelerated version, also finishes off last few values (< 4) from 730 // above. 731 for (; i < num_vals; ++i) { 732 G[0] += square(vals_x[i]); 733 G[1] += vals_x[i] * vals_y[i]; 734 G[3] += square(vals_y[i]); 735 } 736 737 // The matrix is symmetric, so this is a given. 738 G[2] = G[1]; 739} 740 741 742// Puts the image gradient matrix about a pixel into the 2x2 float array G. 743// Looks up interpolated pixels, then calls above method for implementation. 
744inline void calculateG(const int window_size, 745 const float32 center_x, const float center_y, 746 const Image<int32>& I_x, const Image<int32>& I_y, 747 float* const G) { 748 CHECK(I_x.validPixel(center_x, center_y), "Problem in calculateG!"); 749 750 // Hardcoded to allow for a max window radius of 5 (9 pixels x 9 pixels). 751 static const int kMaxWindowRadius = 5; 752 CHECK(window_size <= kMaxWindowRadius, 753 "Window %d > %d!", window_size, kMaxWindowRadius); 754 755 // Diameter of window is 2 * radius + 1 for center pixel. 756 static const int kWindowBufferSize = 757 (kMaxWindowRadius * 2 + 1) * (kMaxWindowRadius * 2 + 1); 758 759 // Preallocate buffers statically for efficiency. 760 static float32 vals_x[kWindowBufferSize]; 761 static float32 vals_y[kWindowBufferSize]; 762 763 int32 num_vals = 0; 764 765 for (int32 win_x = -window_size; win_x <= window_size; ++win_x) { 766 for (int32 win_y = -window_size; win_y <= window_size; ++win_y) { 767 vals_x[num_vals] = I_x.getPixelInterp(center_x + win_x, 768 center_y + win_y); 769 vals_y[num_vals] = I_y.getPixelInterp(center_x + win_x, 770 center_y + win_y); 771 ++num_vals; 772 } 773 } 774 calculateG(vals_x, vals_y, num_vals, G); 775} 776 777} // namespace flow 778 779#endif // JAVA_COM_GOOGLE_ANDROID_APPS_UNVEIL_JNI_OPTICALFLOW_IMAGE_H_