PageRenderTime 1374ms CodeModel.GetById 644ms app.highlight 96ms RepoModel.GetById 628ms app.codeStats 1ms

/Python/pystrtod.c

http://unladen-swallow.googlecode.com/
C | 490 lines | 278 code | 65 blank | 147 comment | 147 complexity | 6ee33d6c23737a7da6c4b362aa2aed7d MD5 | raw file
  1/* -*- Mode: C; c-file-style: "python" -*- */
  2
  3#include <Python.h>
  4#include <locale.h>
  5
  6/* ascii character tests (as opposed to locale tests) */
  7#define ISSPACE(c)  ((c) == ' ' || (c) == '\f' || (c) == '\n' || \
  8                     (c) == '\r' || (c) == '\t' || (c) == '\v')
  9#define ISDIGIT(c)  ((c) >= '0' && (c) <= '9')
 10
 11
 12/**
 13 * PyOS_ascii_strtod:
 14 * @nptr:    the string to convert to a numeric value.
 15 * @endptr:  if non-%NULL, it returns the character after
 16 *           the last character used in the conversion.
 17 * 
 18 * Converts a string to a #gdouble value.
 19 * This function behaves like the standard strtod() function
 20 * does in the C locale. It does this without actually
 21 * changing the current locale, since that would not be
 22 * thread-safe.
 23 *
 24 * This function is typically used when reading configuration
 25 * files or other non-user input that should be locale independent.
 26 * To handle input from the user you should normally use the
 27 * locale-sensitive system strtod() function.
 28 *
 29 * If the correct value would cause overflow, plus or minus %HUGE_VAL
 30 * is returned (according to the sign of the value), and %ERANGE is
 31 * stored in %errno. If the correct value would cause underflow,
 32 * zero is returned and %ERANGE is stored in %errno.
 33 * If memory allocation fails, %ENOMEM is stored in %errno.
 34 * 
 35 * This function resets %errno before calling strtod() so that
 36 * you can reliably detect overflow and underflow.
 37 *
 38 * Return value: the #gdouble value.
 39 **/
 40double
 41PyOS_ascii_strtod(const char *nptr, char **endptr)
 42{
 43	char *fail_pos;
 44	double val = -1.0;
 45	struct lconv *locale_data;
 46	const char *decimal_point;
 47	size_t decimal_point_len;
 48	const char *p, *decimal_point_pos;
 49	const char *end = NULL; /* Silence gcc */
 50	const char *digits_pos = NULL;
 51	int negate = 0;
 52
 53	assert(nptr != NULL);
 54
 55	fail_pos = NULL;
 56
 57	locale_data = localeconv();
 58	decimal_point = locale_data->decimal_point;
 59	decimal_point_len = strlen(decimal_point);
 60
 61	assert(decimal_point_len != 0);
 62
 63	decimal_point_pos = NULL;
 64
 65	/* We process any leading whitespace and the optional sign manually,
 66	   then pass the remainder to the system strtod.  This ensures that
 67	   the result of an underflow has the correct sign. (bug #1725)  */
 68
 69	p = nptr;
 70	/* Skip leading space */
 71	while (ISSPACE(*p))
 72		p++;
 73
 74	/* Process leading sign, if present */
 75	if (*p == '-') {
 76		negate = 1;
 77		p++;
 78	} else if (*p == '+') {
 79		p++;
 80	}
 81
 82	/* What's left should begin with a digit, a decimal point, or one of
 83	   the letters i, I, n, N. It should not begin with 0x or 0X */
 84	if ((!ISDIGIT(*p) &&
 85	     *p != '.' && *p != 'i' && *p != 'I' && *p != 'n' && *p != 'N')
 86	    ||
 87	    (*p == '0' && (p[1] == 'x' || p[1] == 'X')))
 88	{
 89		if (endptr)
 90			*endptr = (char*)nptr;
 91		errno = EINVAL;
 92		return val;
 93	}
 94	digits_pos = p;
 95
 96	if (decimal_point[0] != '.' || 
 97	    decimal_point[1] != 0)
 98	{
 99		while (ISDIGIT(*p))
100			p++;
101
102		if (*p == '.')
103		{
104			decimal_point_pos = p++;
105
106			while (ISDIGIT(*p))
107				p++;
108
109			if (*p == 'e' || *p == 'E')
110				p++;
111			if (*p == '+' || *p == '-')
112				p++;
113			while (ISDIGIT(*p))
114				p++;
115			end = p;
116		}
117		else if (strncmp(p, decimal_point, decimal_point_len) == 0)
118		{
119			/* Python bug #1417699 */
120			if (endptr)
121				*endptr = (char*)nptr;
122			errno = EINVAL;
123			return val;
124		}
125		/* For the other cases, we need not convert the decimal
126		   point */
127	}
128
129	/* Set errno to zero, so that we can distinguish zero results
130	   and underflows */
131	errno = 0;
132
133	if (decimal_point_pos)
134	{
135		char *copy, *c;
136
137		/* We need to convert the '.' to the locale specific decimal
138		   point */
139		copy = (char *)PyMem_MALLOC(end - digits_pos +
140					    1 + decimal_point_len);
141		if (copy == NULL) {
142			if (endptr)
143				*endptr = (char *)nptr;
144			errno = ENOMEM;
145			return val;
146		}
147
148		c = copy;
149		memcpy(c, digits_pos, decimal_point_pos - digits_pos);
150		c += decimal_point_pos - digits_pos;
151		memcpy(c, decimal_point, decimal_point_len);
152		c += decimal_point_len;
153		memcpy(c, decimal_point_pos + 1,
154		       end - (decimal_point_pos + 1));
155		c += end - (decimal_point_pos + 1);
156		*c = 0;
157
158		val = strtod(copy, &fail_pos);
159
160		if (fail_pos)
161		{
162			if (fail_pos > decimal_point_pos)
163				fail_pos = (char *)digits_pos +
164					(fail_pos - copy) -
165					(decimal_point_len - 1);
166			else
167				fail_pos = (char *)digits_pos +
168					(fail_pos - copy);
169		}
170
171		PyMem_FREE(copy);
172
173	}
174	else {
175		val = strtod(digits_pos, &fail_pos);
176	}
177
178	if (fail_pos == digits_pos)
179		fail_pos = (char *)nptr;
180
181	if (negate && fail_pos != nptr)
182		val = -val;
183
184	if (endptr)
185		*endptr = fail_pos;
186
187	return val;
188}
189
190/* Given a string that may have a decimal point in the current
191   locale, change it back to a dot.  Since the string cannot get
192   longer, no need for a maximum buffer size parameter. */
193Py_LOCAL_INLINE(void)
194change_decimal_from_locale_to_dot(char* buffer)
195{
196	struct lconv *locale_data = localeconv();
197	const char *decimal_point = locale_data->decimal_point;
198
199	if (decimal_point[0] != '.' || decimal_point[1] != 0) {
200		size_t decimal_point_len = strlen(decimal_point);
201
202		if (*buffer == '+' || *buffer == '-')
203			buffer++;
204		while (isdigit(Py_CHARMASK(*buffer)))
205			buffer++;
206		if (strncmp(buffer, decimal_point, decimal_point_len) == 0) {
207			*buffer = '.';
208			buffer++;
209			if (decimal_point_len > 1) {
210				/* buffer needs to get smaller */
211				size_t rest_len = strlen(buffer +
212						     (decimal_point_len - 1));
213				memmove(buffer,
214					buffer + (decimal_point_len - 1),
215					rest_len);
216				buffer[rest_len] = 0;
217			}
218		}
219	}
220}
221
222
223/* From the C99 standard, section 7.19.6:
224The exponent always contains at least two digits, and only as many more digits
225as necessary to represent the exponent.
226*/
227#define MIN_EXPONENT_DIGITS 2
228
229/* Ensure that any exponent, if present, is at least MIN_EXPONENT_DIGITS
230   in length. */
231Py_LOCAL_INLINE(void)
232ensure_minimum_exponent_length(char* buffer, size_t buf_size)
233{
234	char *p = strpbrk(buffer, "eE");
235	if (p && (*(p + 1) == '-' || *(p + 1) == '+')) {
236		char *start = p + 2;
237		int exponent_digit_cnt = 0;
238		int leading_zero_cnt = 0;
239		int in_leading_zeros = 1;
240		int significant_digit_cnt;
241
242		/* Skip over the exponent and the sign. */
243		p += 2;
244
245		/* Find the end of the exponent, keeping track of leading
246		   zeros. */
247		while (*p && isdigit(Py_CHARMASK(*p))) {
248			if (in_leading_zeros && *p == '0')
249				++leading_zero_cnt;
250			if (*p != '0')
251				in_leading_zeros = 0;
252			++p;
253			++exponent_digit_cnt;
254		}
255
256		significant_digit_cnt = exponent_digit_cnt - leading_zero_cnt;
257		if (exponent_digit_cnt == MIN_EXPONENT_DIGITS) {
258			/* If there are 2 exactly digits, we're done,
259			   regardless of what they contain */
260		}
261		else if (exponent_digit_cnt > MIN_EXPONENT_DIGITS) {
262			int extra_zeros_cnt;
263
264			/* There are more than 2 digits in the exponent.  See
265			   if we can delete some of the leading zeros */
266			if (significant_digit_cnt < MIN_EXPONENT_DIGITS)
267				significant_digit_cnt = MIN_EXPONENT_DIGITS;
268			extra_zeros_cnt = exponent_digit_cnt -
269				significant_digit_cnt;
270
271			/* Delete extra_zeros_cnt worth of characters from the
272			   front of the exponent */
273			assert(extra_zeros_cnt >= 0);
274
275			/* Add one to significant_digit_cnt to copy the
276			   trailing 0 byte, thus setting the length */
277			memmove(start,
278				start + extra_zeros_cnt,
279				significant_digit_cnt + 1);
280		}
281		else {
282			/* If there are fewer than 2 digits, add zeros
283			   until there are 2, if there's enough room */
284			int zeros = MIN_EXPONENT_DIGITS - exponent_digit_cnt;
285			if (start + zeros + exponent_digit_cnt + 1
286			      < buffer + buf_size) {
287				memmove(start + zeros, start,
288					exponent_digit_cnt + 1);
289				memset(start, '0', zeros);
290			}
291		}
292	}
293}
294
295/* Ensure that buffer has a decimal point in it.  The decimal point
296   will not be in the current locale, it will always be '.' */
297Py_LOCAL_INLINE(void)
298ensure_decimal_point(char* buffer, size_t buf_size)
299{
300	int insert_count = 0;
301	char* chars_to_insert;
302
303	/* search for the first non-digit character */
304	char *p = buffer;
305	if (*p == '-' || *p == '+')
306		/* Skip leading sign, if present.  I think this could only
307		   ever be '-', but it can't hurt to check for both. */
308		++p;
309	while (*p && isdigit(Py_CHARMASK(*p)))
310		++p;
311
312	if (*p == '.') {
313		if (isdigit(Py_CHARMASK(*(p+1)))) {
314			/* Nothing to do, we already have a decimal
315			   point and a digit after it */
316		}
317		else {
318			/* We have a decimal point, but no following
319			   digit.  Insert a zero after the decimal. */
320			++p;
321			chars_to_insert = "0";
322			insert_count = 1;
323		}
324	}
325	else {
326		chars_to_insert = ".0";
327		insert_count = 2;
328	}
329	if (insert_count) {
330		size_t buf_len = strlen(buffer);
331		if (buf_len + insert_count + 1 >= buf_size) {
332			/* If there is not enough room in the buffer
333			   for the additional text, just skip it.  It's
334			   not worth generating an error over. */
335		}
336		else {
337			memmove(p + insert_count, p,
338				buffer + strlen(buffer) - p + 1);
339			memcpy(p, chars_to_insert, insert_count);
340		}
341	}
342}
343
344/* Add the locale specific grouping characters to buffer.  Note
345   that any decimal point (if it's present) in buffer is already
346   locale-specific.  Return 0 on error, else 1. */
347Py_LOCAL_INLINE(int)
348add_thousands_grouping(char* buffer, size_t buf_size)
349{
350	Py_ssize_t len = strlen(buffer);
351	struct lconv *locale_data = localeconv();
352	const char *decimal_point = locale_data->decimal_point;
353
354	/* Find the decimal point, if any.  We're only concerned
355	   about the characters to the left of the decimal when
356	   adding grouping. */
357	char *p = strstr(buffer, decimal_point);
358	if (!p) {
359		/* No decimal, use the entire string. */
360
361		/* If any exponent, adjust p. */
362		p = strpbrk(buffer, "eE");
363		if (!p)
364			/* No exponent and no decimal.  Use the entire
365			   string. */
366			p = buffer + len;
367	}
368	/* At this point, p points just past the right-most character we
369	   want to format.  We need to add the grouping string for the
370	   characters between buffer and p. */
371	return _PyString_InsertThousandsGrouping(buffer, len, p-buffer,
372						 buf_size, NULL, 1);
373}
374
/* see FORMATBUFLEN in unicodeobject.c */
#define FLOAT_FORMATBUFLEN 120

/**
 * PyOS_ascii_formatd:
 * @buffer: A buffer to place the resulting string in
 * @buf_size: The length of the buffer.
 * @format: The printf()-style format to use for the
 *          code to use for converting.
 * @d: The double to convert
 *
 * Converts a double to a string, using the '.' as
 * decimal point. To format the number you pass in
 * a printf()-style format string. Allowed conversion
 * specifiers are 'e', 'E', 'f', 'F', 'g', 'G', 'n' and 'Z'.
 *
 * 'n' is the same as 'g', except it uses the current locale.
 * 'Z' is the same as 'g', except it always has a decimal and
 *     at least one digit after the decimal.
 *
 * The format must start with '%', must end with one of the conversion
 * specifiers above, and must not contain a single quote, a lowercase
 * 'l' or another '%' after the first character.
 *
 * Return value: The pointer to the buffer with the converted string,
 * or %NULL if the format is rejected (including an empty format or an
 * 'n'/'Z' format too long to be copied) or thousands grouping fails.
 **/
char *
PyOS_ascii_formatd(char       *buffer,
		   size_t      buf_size,
		   const char *format,
		   double      d)
{
	char format_char;
	size_t format_len = strlen(format);

	/* For type 'n', we need to make a copy of the format string, because
	   we're going to modify 'n' -> 'g', and format is const char*, so we
	   can't modify it directly.  FLOAT_FORMATBUFLEN should be longer than
	   we ever need this to be.  There's an upcoming check to ensure it's
	   big enough. */
	/* Issue 2264: code 'Z' requires copying the format.  'Z' is 'g', but
	   also with at least one character past the decimal. */
	char tmp_format[FLOAT_FORMATBUFLEN];

	/* Validate the leading '%' before anything else.  This also rejects
	   the empty string, which guarantees format_len >= 1 so that the
	   format[format_len - 1] read below stays inside the string. */
	if (format[0] != '%')
		return NULL;

	/* The last character in the format string must be the format char */
	format_char = format[format_len - 1];

	/* Reject formats that contain a single quote, a lowercase l, or a
	   percent anywhere after the first character. */
	if (strpbrk(format + 1, "'l%"))
		return NULL;

	/* Also curious about this function is that it accepts format strings
	   like "%xg", which are invalid for floats.  In general, the
	   interface to this function is not very good, but changing it is
	   difficult because it's a public API. */

	if (!(format_char == 'e' || format_char == 'E' ||
	      format_char == 'f' || format_char == 'F' ||
	      format_char == 'g' || format_char == 'G' ||
	      format_char == 'n' || format_char == 'Z'))
		return NULL;

	/* Map 'n' or 'Z' format_char to 'g', by copying the format string and
	   replacing the final char with a 'g' */
	if (format_char == 'n' || format_char == 'Z') {
		if (format_len + 1 >= sizeof(tmp_format)) {
			/* The format won't fit in our copy.  Error out.  In
			   practice, this will never happen and will be
			   detected by returning NULL */
			return NULL;
		}
		strcpy(tmp_format, format);
		tmp_format[format_len - 1] = 'g';
		format = tmp_format;
	}


	/* Have PyOS_snprintf do the hard work */
	PyOS_snprintf(buffer, buf_size, format, d);

	/* Do various fixups on the return string */

	/* Get the current locale, and find the decimal point string.
	   Convert that string back to a dot.  Do not do this if using the
	   'n' (number) format code, since we want to keep the localized
	   decimal point in that case. */
	if (format_char != 'n')
		change_decimal_from_locale_to_dot(buffer);

	/* If an exponent exists, ensure that the exponent is at least
	   MIN_EXPONENT_DIGITS digits, providing the buffer is large enough
	   for the extra zeros.  Also, if there are more than
	   MIN_EXPONENT_DIGITS, remove as many zeros as possible until we get
	   back to MIN_EXPONENT_DIGITS */
	ensure_minimum_exponent_length(buffer, buf_size);

	/* If format_char is 'Z', make sure we have at least one character
	   after the decimal point (and make sure we have a decimal point). */
	if (format_char == 'Z')
		ensure_decimal_point(buffer, buf_size);

	/* If format_char is 'n', add the thousands grouping. */
	if (format_char == 'n')
		if (!add_thousands_grouping(buffer, buf_size))
			return NULL;

	return buffer;
}
485
/* Convert nptr to a double by calling PyOS_ascii_strtod() with a NULL
   endptr, so the position where parsing stopped is not reported. */
double
PyOS_ascii_atof(const char *nptr)
{
	double value = PyOS_ascii_strtod(nptr, NULL);

	return value;
}