PageRenderTime 86ms CodeModel.GetById 12ms app.highlight 70ms RepoModel.GetById 1ms app.codeStats 0ms

/tags/release-0.0.0-rc0/hive/external/odbc/src/cpp/HiveRowSet.cpp

#
C++ | 465 lines | 358 code | 45 blank | 62 comment | 86 complexity | ef0ac1e322352f29e2b9a0f23dd3923d MD5 | raw file
  1/**
  2 * Licensed to the Apache Software Foundation (ASF) under one
  3 * or more contributor license agreements.  See the NOTICE file
  4 * distributed with this work for additional information
  5 * regarding copyright ownership.  The ASF licenses this file
  6 * to you under the Apache License, Version 2.0 (the
  7 * "License"); you may not use this file except in compliance
  8 * with the License.  You may obtain a copy of the License at
  9 *
 10 *     http://www.apache.org/licenses/LICENSE-2.0
 11 *
 12 * Unless required by applicable law or agreed to in writing, software
 13 * distributed under the License is distributed on an "AS IS" BASIS,
 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 * See the License for the specific language governing permissions and
 16 * limitations under the License.
 17 */
 18
 19#include <assert.h>
 20#include <string.h>
 21
 22#include "HiveRowSet.h"
 23#include "hiveclienthelper.h"
 24
 25
 26/*************************************************************************************************
 27 * Base HiveRowSet Class Logic
 28 ************************************************************************************************/
 29
 30HiveRowSet::HiveRowSet() {
 31  m_is_completely_read = false;
 32  m_bytes_read = 0;
 33  m_last_column_fetched = 0;
 34  m_field_buffer[0] = '\0';
 35}
 36
 37HiveRowSet::~HiveRowSet() {
 38}
 39
 40void HiveRowSet::reset() {
 41  m_is_completely_read = false;
 42  m_bytes_read = 0;
 43  m_last_column_fetched = 0;
 44  m_field_buffer[0] = '\0';
 45  /* Non Virtual Calls Pure Virtual Idiom */
 46  specialized_reset(); /* Call the specialized subclass reset method */
 47}
 48
 49void HiveRowSet::initFieldBuffer() {
 50  /* m_field_buffer should always correspond to the field indicated by m_last_column_fetched*/
 51  extractField(m_last_column_fetched);
 52}
 53
 54HiveReturn HiveRowSet::getFieldDataLen(size_t column_idx, size_t* col_len, char* err_buf,
 55                                       size_t err_buf_len) {
 56  RETURN_ON_ASSERT(col_len == NULL, __FUNCTION__,
 57                   "Pointer to col_len (output) cannot be NULL.", err_buf, err_buf_len, HIVE_ERROR);
 58  RETURN_ON_ASSERT(getColumnCount() == 0, __FUNCTION__,
 59                   "Rowset contains zero columns.", err_buf, err_buf_len, HIVE_ERROR);
 60  RETURN_ON_ASSERT(column_idx >= getColumnCount(), __FUNCTION__,
 61                   "Column index out of bounds.", err_buf, err_buf_len, HIVE_ERROR);
 62  *col_len = getFieldLen(column_idx);
 63  return HIVE_SUCCESS;
 64}
 65
 66HiveReturn HiveRowSet::getFieldAsCString(size_t column_idx, char* buffer, size_t buffer_len,
 67                                         size_t* data_byte_size, int* is_null_value, char* err_buf,
 68                                         size_t err_buf_len) {
 69  RETURN_ON_ASSERT(buffer == NULL, __FUNCTION__,
 70                   "Column data output buffer cannot be NULL.", err_buf, err_buf_len, HIVE_ERROR);
 71  RETURN_ON_ASSERT(is_null_value == NULL, __FUNCTION__,
 72                   "Column data is_null_value (output) cannot be NULL.", err_buf, err_buf_len,
 73                   HIVE_ERROR);
 74  RETURN_ON_ASSERT(getColumnCount() == 0, __FUNCTION__,
 75                   "Rowset contains zero columns.", err_buf, err_buf_len, HIVE_ERROR);
 76  RETURN_ON_ASSERT(column_idx >= getColumnCount(), __FUNCTION__,
 77                   "Column index out of bounds.", err_buf, err_buf_len, HIVE_ERROR);
 78  RETURN_ON_ASSERT(buffer_len == 0, __FUNCTION__,
 79                   "Output buffer cannot have a size of zero.", err_buf, err_buf_len, HIVE_ERROR);
 80
 81  if (m_last_column_fetched != column_idx) {
 82    extractField(column_idx);
 83    m_bytes_read = 0; /* Reset the read offset if different from the last column fetched */
 84    m_last_column_fetched = column_idx;
 85    m_is_completely_read = false;
 86  }
 87  if (m_is_completely_read) {
 88    return HIVE_NO_MORE_DATA; /* This field has already been completely fetched by a previous call*/
 89  }
 90  /* If the column data is the same as the null format spec... */
 91  if (strcmp(getNullFormat(), m_field_buffer) == 0) {
 92    /* This value must be NULL */
 93    *is_null_value = 1;
 94    if (data_byte_size != NULL) {
 95      *data_byte_size = 0;
 96    }
 97    buffer[0] = '\0';
 98  } else {
 99    /* This value has been determined not to be NULL */
100    *is_null_value = 0;
101    size_t data_total_len = getFieldLen(column_idx);
102    /* Cannot read more data then the total number of bytes available */
103    assert(data_total_len >= m_bytes_read);
104    size_t bytes_remaining = data_total_len - m_bytes_read; // Excludes null char
105    if (data_byte_size != NULL) {
106      /* Save the number of remaining characters to return before this fetch */
107      *data_byte_size = bytes_remaining;
108    }
109    /* Move pointer to the read location */
110    const char* src_str_ptr = m_field_buffer + m_bytes_read;
111    /* The total number of bytes to read (+1 null terminator) should be no more than the
112     * size of the field buffer */
113    assert(m_bytes_read + bytes_remaining + 1 <= sizeof(m_field_buffer));
114    /* Copy as many characters as possible from the read location */
115    size_t bytes_copied = safe_strncpy(buffer, src_str_ptr, min(buffer_len, bytes_remaining + 1)); // +1 for null terminator
116    /* bytes_copied does not count the null terminator */
117    m_bytes_read += bytes_copied;
118    if (m_bytes_read < data_total_len) {
119      return HIVE_SUCCESS_WITH_MORE_DATA; /* Data truncated; more data to return */
120    }
121  }
122  m_is_completely_read = true;
123  return HIVE_SUCCESS; /* All data successfully read */
124}
125
126HiveReturn HiveRowSet::getFieldAsDouble(size_t column_idx, double* buffer, int* is_null_value,
127                                        char* err_buf, size_t err_buf_len) {
128  RETURN_ON_ASSERT(buffer == NULL, __FUNCTION__,
129                   "Column data output buffer cannot be NULL.", err_buf, err_buf_len, HIVE_ERROR);
130  RETURN_ON_ASSERT(is_null_value == NULL, __FUNCTION__,
131                   "Column data is_null_value (output) cannot be NULL.", err_buf, err_buf_len,
132                   HIVE_ERROR);
133  RETURN_ON_ASSERT(getColumnCount() == 0, __FUNCTION__,
134                   "Rowset contains zero columns.", err_buf, err_buf_len, HIVE_ERROR);
135  RETURN_ON_ASSERT(column_idx >= getColumnCount(), __FUNCTION__,
136                   "Column index out of bounds.", err_buf, err_buf_len, HIVE_ERROR);
137
138  if (m_last_column_fetched != column_idx) {
139    /* Reset if this column was not fetched on the last attempt */
140    extractField(column_idx);
141    m_bytes_read = 0; /* Reset the read offset if different from the last column fetched */
142    m_last_column_fetched = column_idx;
143    m_is_completely_read = false;
144  }
145  if (m_is_completely_read) {
146    return HIVE_NO_MORE_DATA; /* This column has already been completely fetched */
147  }
148  /* If the column data is the same as the nullformat spec... */
149  if (strcmp(getNullFormat(), m_field_buffer) == 0) {
150    *is_null_value = 1;
151    *buffer = 0.0;
152  } else {
153    *is_null_value = 0;
154    *buffer = atof(m_field_buffer);
155  }
156  m_is_completely_read = true;
157  return HIVE_SUCCESS;
158}
159
160HiveReturn HiveRowSet::getFieldAsInt(size_t column_idx, int* buffer, int* is_null_value,
161                                     char* err_buf, size_t err_buf_len) {
162  RETURN_ON_ASSERT(buffer == NULL, __FUNCTION__,
163                   "Column data output buffer cannot be NULL.", err_buf, err_buf_len, HIVE_ERROR);
164  RETURN_ON_ASSERT(is_null_value == NULL, __FUNCTION__,
165                   "Column data is_null_value (output) cannot be NULL.", err_buf, err_buf_len,
166                   HIVE_ERROR);
167  RETURN_ON_ASSERT(getColumnCount() == 0, __FUNCTION__,
168                   "Rowset contains zero columns.", err_buf, err_buf_len, HIVE_ERROR);
169  RETURN_ON_ASSERT(column_idx >= getColumnCount(), __FUNCTION__,
170                   "Column index out of bounds.", err_buf, err_buf_len, HIVE_ERROR);
171
172  if (m_last_column_fetched != column_idx) {
173    extractField(column_idx);
174    m_bytes_read = 0; /* Reset the read offset if different from the last column fetched */
175    m_last_column_fetched = column_idx;
176    m_is_completely_read = false;
177  }
178  if (m_is_completely_read) {
179    return HIVE_NO_MORE_DATA; /* This column has already been completely fetched */
180  }
181  /* If the column data is the same as the null format spec... */
182  if (strcmp(getNullFormat(), m_field_buffer) == 0) {
183    *is_null_value = 1;
184    *buffer = 0;
185  } else {
186    *is_null_value = 0;
187    *buffer = atoi(m_field_buffer);
188  }
189  m_is_completely_read = true;
190  return HIVE_SUCCESS;
191}
192
193HiveReturn HiveRowSet::getFieldAsLong(size_t column_idx, long* buffer, int* is_null_value,
194                                      char* err_buf, size_t err_buf_len) {
195  RETURN_ON_ASSERT(buffer == NULL, __FUNCTION__,
196                   "Column data output buffer cannot be NULL.", err_buf, err_buf_len, HIVE_ERROR);
197  RETURN_ON_ASSERT(is_null_value == NULL, __FUNCTION__,
198                   "Column data is_null_value (output) cannot be NULL.", err_buf, err_buf_len,
199                   HIVE_ERROR);
200  RETURN_ON_ASSERT(getColumnCount() == 0, __FUNCTION__,
201                   "Rowset contains zero columns.", err_buf, err_buf_len, HIVE_ERROR);
202  RETURN_ON_ASSERT(column_idx >= getColumnCount(), __FUNCTION__,
203                   "Column index out of bounds.", err_buf, err_buf_len, HIVE_ERROR);
204
205  if (m_last_column_fetched != column_idx) {
206    extractField(column_idx);
207    m_bytes_read = 0; /* Reset the read offset if different from the last column fetched */
208    m_last_column_fetched = column_idx;
209    m_is_completely_read = false;
210  }
211  if (m_is_completely_read) {
212    return HIVE_NO_MORE_DATA; /* This column has already been completely fetched */
213  }
214  /* If the column data is the same as the null format spec... */
215  if (strcmp(getNullFormat(), m_field_buffer) == 0) {
216    *is_null_value = 1;
217    *buffer = 0;
218  } else {
219    *is_null_value = 0;
220    *buffer = atol(m_field_buffer);
221  }
222  m_is_completely_read = true;
223  return HIVE_SUCCESS;
224}
225
226HiveReturn HiveRowSet::getFieldAsULong(size_t column_idx, unsigned long* buffer,
227                                       int* is_null_value, char* err_buf, size_t err_buf_len) {
228  RETURN_ON_ASSERT(buffer == NULL, __FUNCTION__,
229                   "Column data output buffer cannot be NULL.", err_buf, err_buf_len, HIVE_ERROR);
230  RETURN_ON_ASSERT(is_null_value == NULL, __FUNCTION__,
231                   "Column data is_null_value (output) cannot be NULL.", err_buf, err_buf_len,
232                   HIVE_ERROR);
233  RETURN_ON_ASSERT(getColumnCount() == 0, __FUNCTION__,
234                   "Rowset contains zero columns.", err_buf, err_buf_len, HIVE_ERROR);
235  RETURN_ON_ASSERT(column_idx >= getColumnCount(), __FUNCTION__,
236                   "Column index out of bounds.", err_buf, err_buf_len, HIVE_ERROR);
237
238  if (m_last_column_fetched != column_idx) {
239    extractField(column_idx);
240    m_bytes_read = 0; /* Reset the read offset if different from the last column fetched */
241    m_last_column_fetched = column_idx;
242    m_is_completely_read = false;
243  }
244  if (m_is_completely_read) {
245    return HIVE_NO_MORE_DATA; /* This column has already been completely fetched */
246  }
247  /* If the column data is the same as the null format spec... */
248  if (strcmp(getNullFormat(), m_field_buffer) == 0) {
249    *is_null_value = 1;
250    *buffer = 0;
251  } else {
252    *is_null_value = 0;
253    *buffer = strtoul(m_field_buffer, NULL, 10);
254  }
255  m_is_completely_read = true;
256  return HIVE_SUCCESS;
257}
258
259HiveReturn HiveRowSet::getFieldAsI64(size_t column_idx, int64_t* buffer, int* is_null_value,
260                                     char* err_buf, size_t err_buf_len) {
261  RETURN_ON_ASSERT(buffer == NULL, __FUNCTION__,
262                   "Column data output buffer cannot be NULL.", err_buf, err_buf_len, HIVE_ERROR);
263  RETURN_ON_ASSERT(is_null_value == NULL, __FUNCTION__,
264                   "Column data is_null_value (output) cannot be NULL.", err_buf, err_buf_len,
265                   HIVE_ERROR);
266  RETURN_ON_ASSERT(getColumnCount() == 0, __FUNCTION__,
267                   "Rowset contains zero columns.", err_buf, err_buf_len, HIVE_ERROR);
268  RETURN_ON_ASSERT(column_idx >= getColumnCount(), __FUNCTION__,
269                   "Column index out of bounds.", err_buf, err_buf_len, HIVE_ERROR);
270
271  if (m_last_column_fetched != column_idx) {
272    extractField(column_idx);
273    m_bytes_read = 0; /* Reset the read offset if different from the last column fetched */
274    m_last_column_fetched = column_idx;
275    m_is_completely_read = false;
276  }
277  if (m_is_completely_read) {
278    return HIVE_NO_MORE_DATA; /* This column has already been completely fetched */
279  }
280  /* If the column data is the same as the null format spec... */
281  if (strcmp(getNullFormat(), m_field_buffer) == 0) {
282    *is_null_value = 1;
283    *buffer = 0;
284  } else {
285    *is_null_value = 0;
286    *buffer = ATOI64(m_field_buffer);
287  }
288  m_is_completely_read = true;
289  return HIVE_SUCCESS;
290}
291
292HiveReturn HiveRowSet::getFieldAsI64U(size_t column_idx, uint64_t* buffer, int* is_null_value,
293                                      char* err_buf, size_t err_buf_len) {
294  RETURN_ON_ASSERT(buffer == NULL, __FUNCTION__,
295                   "Column data output buffer cannot be NULL.", err_buf, err_buf_len, HIVE_ERROR);
296  RETURN_ON_ASSERT(is_null_value == NULL, __FUNCTION__,
297                   "Column data is_null_value (output) cannot be NULL.", err_buf, err_buf_len,
298                   HIVE_ERROR);
299  RETURN_ON_ASSERT(getColumnCount() == 0, __FUNCTION__,
300                   "Rowset contains zero columns.", err_buf, err_buf_len, HIVE_ERROR);
301  RETURN_ON_ASSERT(column_idx >= getColumnCount(), __FUNCTION__,
302                   "Column index out of bounds.", err_buf, err_buf_len, HIVE_ERROR);
303
304  if (m_last_column_fetched != column_idx) {
305    extractField(column_idx);
306    m_bytes_read = 0; /* Reset the read offset if different from the last column fetched */
307    m_last_column_fetched = column_idx;
308    m_is_completely_read = false;
309  }
310  if (m_is_completely_read) {
311    return HIVE_NO_MORE_DATA; /* This column has already been completely fetched */
312  }
313  /* If the column data is the same as the null format spec... */
314  if (strcmp(getNullFormat(), m_field_buffer) == 0) {
315    *is_null_value = 1;
316    *buffer = 0;
317  } else {
318    *is_null_value = 0;
319    *buffer = ATOI64U(m_field_buffer);
320  }
321  m_is_completely_read = true;
322  return HIVE_SUCCESS;
323}
324
325/*************************************************************************************************
326 * HiveSerializedRowSet Subclass Definition
327 ************************************************************************************************/
328
329HiveSerializedRowSet::HiveSerializedRowSet() {
330  m_row_weak_ptr = NULL;
331  m_null_format_weak_ptr = NULL;
332}
333
334HiveSerializedRowSet::~HiveSerializedRowSet() {
335  /* Nothing to deallocate */
336}
337
338void HiveSerializedRowSet::initialize(Apache::Hadoop::Hive::Schema& schema, string& serialized_row) {
339  m_row_weak_ptr = &serialized_row;
340  /* Allocate sufficient space to prevent further resizing */
341  m_field_offsets.reserve(schema.fieldSchemas.size());
342  initializeOffsets(schema, serialized_row); // Initialize m_field_offsets
343  assert(m_field_offsets.size() == schema.fieldSchemas.size());
344  assert(schema.properties[SERIALIZATION_NULL_FORMAT].length() > 0);
345  m_null_format_weak_ptr = &(schema.properties[SERIALIZATION_NULL_FORMAT]);
346  /* Synchronize m_field_buffer and m_last_column_fetched now that extractField() works */
347  initFieldBuffer();
348}
349
350/* This method should never be called outside of the inherited HiveRowSet::reset() */
351void HiveSerializedRowSet::specialized_reset() {
352  m_row_weak_ptr = NULL;
353  m_field_offsets.clear();
354  m_null_format_weak_ptr = NULL;
355}
356
357void HiveSerializedRowSet::initializeOffsets(Apache::Hadoop::Hive::Schema& schema, string& serialized_row) {
358  m_field_offsets.push_back(0); // There will always be at least one column
359  // Keep a temporary field_delim reference so we don't have to keep using the map
360  string& field_delim(schema.properties[FIELD_DELIM]);
361  assert(field_delim.length() > 0);
362
363  // Assumes that field delimiters will only be one character
364  size_t idx = serialized_row.find_first_of(field_delim);
365  while (idx != string::npos) {
366    // Set the field offset to the start of the following field
367    m_field_offsets.push_back(idx + 1);
368    idx = serialized_row.find_first_of(field_delim, idx + 1);
369  }
370}
371
372size_t HiveSerializedRowSet::getColumnCount() {
373  return m_field_offsets.size();
374}
375
376const char* HiveSerializedRowSet::getNullFormat() {
377  assert(m_null_format_weak_ptr != NULL);
378  return m_null_format_weak_ptr->c_str();
379}
380
381size_t HiveSerializedRowSet::getFieldLen(size_t column_idx) {
382  assert(column_idx < getColumnCount());
383  assert(m_row_weak_ptr != NULL);
384  size_t len;
385  // If this is the last column...
386  if (column_idx == getColumnCount() - 1) {
387    assert(m_row_weak_ptr->length() >= m_field_offsets[column_idx]);
388    len = m_row_weak_ptr->length() - m_field_offsets[column_idx];
389  } else {
390    assert(m_field_offsets[column_idx + 1] > m_field_offsets[column_idx]);
391    len = m_field_offsets[column_idx + 1] - m_field_offsets[column_idx] - 1;
392  }
393  /* Enforce the constraint that no data exceed MAX_BYTE_LENGTH */
394  len = min(len, (size_t) MAX_BYTE_LENGTH);
395  return len;
396}
397
398void HiveSerializedRowSet::extractField(size_t column_idx) {
399  assert(column_idx < getColumnCount());
400  assert(m_row_weak_ptr != NULL);
401  /* The field buffer should always be large enough to hold the field */
402  assert(getFieldLen(column_idx) < sizeof(m_field_buffer));
403  /* Just safety precaution to prevent buffer overflow */
404  /* Reduce buffer size by one to save space for null terminator */
405  size_t extract_len = min(getFieldLen(column_idx), sizeof(m_field_buffer) - 1);
406  size_t copied = m_row_weak_ptr->copy(m_field_buffer, extract_len, m_field_offsets[column_idx]);
407  assert(copied == extract_len);
408  /* Make sure the buffer is null terminated */
409  m_field_buffer[extract_len] = '\0';
410}
411
412/*************************************************************************************************
413 * HiveStringVectorRowSet Subclass Definition
414 ************************************************************************************************/
415
416HiveStringVectorRowSet::HiveStringVectorRowSet() {
417  m_fields_weak_ptr = NULL;
418  m_null_format_weak_ptr = NULL;
419}
420
421HiveStringVectorRowSet::~HiveStringVectorRowSet() {
422  /* Nothing to deallocate */
423}
424
425void HiveStringVectorRowSet::initialize(Apache::Hadoop::Hive::Schema& schema, vector<string>* fields) {
426  assert(fields != NULL);
427  m_fields_weak_ptr = fields;
428  assert(schema.properties[SERIALIZATION_NULL_FORMAT].length() > 0);
429  m_null_format_weak_ptr = &(schema.properties[SERIALIZATION_NULL_FORMAT]);
430  /* Synchronize m_field_buffer and m_last_column_fetched now that extractField() works */
431  initFieldBuffer();
432}
433
434/* This method should never be called outside of the inherited HiveRowSet::reset() */
435void HiveStringVectorRowSet::specialized_reset() {
436  m_fields_weak_ptr = NULL;
437  m_null_format_weak_ptr = NULL;
438}
439
440size_t HiveStringVectorRowSet::getColumnCount() {
441  assert(m_fields_weak_ptr != NULL);
442  return m_fields_weak_ptr->size();
443}
444
445const char* HiveStringVectorRowSet::getNullFormat() {
446  assert(m_null_format_weak_ptr != NULL);
447  return m_null_format_weak_ptr->c_str();
448}
449
450size_t HiveStringVectorRowSet::getFieldLen(size_t column_idx) {
451  assert(column_idx < getColumnCount());
452  assert(m_fields_weak_ptr != NULL);
453  size_t len = m_fields_weak_ptr->at(column_idx).length();
454  /* Enforce the constraint that no data exceed MAX_BYTE_LENGTH */
455  len = min(len, (size_t) MAX_BYTE_LENGTH);
456  return len;
457}
458
459void HiveStringVectorRowSet::extractField(size_t column_idx) {
460  assert(column_idx < getColumnCount());
461  assert(m_fields_weak_ptr != NULL);
462  safe_strncpy(m_field_buffer, m_fields_weak_ptr->at(column_idx).c_str(), sizeof(m_field_buffer));
463}
464
465