PageRenderTime 143ms CodeModel.GetById 11ms app.highlight 127ms RepoModel.GetById 1ms app.codeStats 0ms

/core/externals/update-engine/externals/google-toolbox-for-mac/Foundation/GTMNSString+HTML.m

http://macfuse.googlecode.com/
Objective C | 522 lines | 431 code | 37 blank | 54 comment | 45 complexity | 9e43339edaea9d54c39f10525f653d76 MD5 | raw file
  1//
  2//  GTMNSString+HTML.m
  3//  Dealing with NSStrings that contain HTML
  4//
  5//  Copyright 2006-2008 Google Inc.
  6//
  7//  Licensed under the Apache License, Version 2.0 (the "License"); you may not
  8//  use this file except in compliance with the License.  You may obtain a copy
  9//  of the License at
 10// 
 11//  http://www.apache.org/licenses/LICENSE-2.0
 12// 
 13//  Unless required by applicable law or agreed to in writing, software
 14//  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 15//  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 16//  License for the specific language governing permissions and limitations under
 17//  the License.
 18//
 19
 20#import "GTMDefines.h"
 21#import "GTMNSString+HTML.h"
 22
 // One row of an escape table: pairs a UTF-16 code unit with the text that is
// written in its place when escaping (and matched against when unescaping).
// The field order matters: the tables below use positional initializers of
// the form { sequence, codeUnit }.
struct HTMLEscapeMap {
  NSString *escapeSequence;  // Replacement text for the character.
  unichar uchar;             // The character, as a single UTF-16 code unit.
};
typedef struct HTMLEscapeMap HTMLEscapeMap;
 27
 // Escape table used when the output must be pure ASCII: every character that
// has a named entity in the XHTML 1.0 entity sets is listed with that entity.
// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// Ordered by uchar lowest to highest for bsearching; keep it sorted when
// adding entries, because lookups use bsearch() keyed on uchar.
static HTMLEscapeMap gAsciiHTMLEscapeMap[] = {
  // A.2.2. Special characters
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // A.2.1. Latin-1 characters
  { @"&nbsp;", 160 },
  { @"&iexcl;", 161 },
  { @"&cent;", 162 },
  { @"&pound;", 163 },
  { @"&curren;", 164 },
  { @"&yen;", 165 },
  { @"&brvbar;", 166 },
  { @"&sect;", 167 },
  { @"&uml;", 168 },
  { @"&copy;", 169 },
  { @"&ordf;", 170 },
  { @"&laquo;", 171 },
  { @"&not;", 172 },
  { @"&shy;", 173 },
  { @"&reg;", 174 },
  { @"&macr;", 175 },
  { @"&deg;", 176 },
  { @"&plusmn;", 177 },
  { @"&sup2;", 178 },
  { @"&sup3;", 179 },
  { @"&acute;", 180 },
  { @"&micro;", 181 },
  { @"&para;", 182 },
  { @"&middot;", 183 },
  { @"&cedil;", 184 },
  { @"&sup1;", 185 },
  { @"&ordm;", 186 },
  { @"&raquo;", 187 },
  { @"&frac14;", 188 },
  { @"&frac12;", 189 },
  { @"&frac34;", 190 },
  { @"&iquest;", 191 },
  { @"&Agrave;", 192 },
  { @"&Aacute;", 193 },
  { @"&Acirc;", 194 },
  { @"&Atilde;", 195 },
  { @"&Auml;", 196 },
  { @"&Aring;", 197 },
  { @"&AElig;", 198 },
  { @"&Ccedil;", 199 },
  { @"&Egrave;", 200 },
  { @"&Eacute;", 201 },
  { @"&Ecirc;", 202 },
  { @"&Euml;", 203 },
  { @"&Igrave;", 204 },
  { @"&Iacute;", 205 },
  { @"&Icirc;", 206 },
  { @"&Iuml;", 207 },
  { @"&ETH;", 208 },
  { @"&Ntilde;", 209 },
  { @"&Ograve;", 210 },
  { @"&Oacute;", 211 },
  { @"&Ocirc;", 212 },
  { @"&Otilde;", 213 },
  { @"&Ouml;", 214 },
  { @"&times;", 215 },
  { @"&Oslash;", 216 },
  { @"&Ugrave;", 217 },
  { @"&Uacute;", 218 },
  { @"&Ucirc;", 219 },
  { @"&Uuml;", 220 },
  { @"&Yacute;", 221 },
  { @"&THORN;", 222 },
  { @"&szlig;", 223 },
  { @"&agrave;", 224 },
  { @"&aacute;", 225 },
  { @"&acirc;", 226 },
  { @"&atilde;", 227 },
  { @"&auml;", 228 },
  { @"&aring;", 229 },
  { @"&aelig;", 230 },
  { @"&ccedil;", 231 },
  { @"&egrave;", 232 },
  { @"&eacute;", 233 },
  { @"&ecirc;", 234 },
  { @"&euml;", 235 },
  { @"&igrave;", 236 },
  { @"&iacute;", 237 },
  { @"&icirc;", 238 },
  { @"&iuml;", 239 },
  { @"&eth;", 240 },
  { @"&ntilde;", 241 },
  { @"&ograve;", 242 },
  { @"&oacute;", 243 },
  { @"&ocirc;", 244 },
  { @"&otilde;", 245 },
  { @"&ouml;", 246 },
  { @"&divide;", 247 },
  { @"&oslash;", 248 },
  { @"&ugrave;", 249 },
  { @"&uacute;", 250 },
  { @"&ucirc;", 251 },
  { @"&uuml;", 252 },
  { @"&yacute;", 253 },
  { @"&thorn;", 254 },
  { @"&yuml;", 255 },

  // A.2.2. Special characters cont'd
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // A.2.3. Symbols
  { @"&fnof;", 402 },

  // A.2.2. Special characters cont'd
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // A.2.3. Symbols cont'd
  { @"&Alpha;", 913 },
  { @"&Beta;", 914 },
  { @"&Gamma;", 915 },
  { @"&Delta;", 916 },
  { @"&Epsilon;", 917 },
  { @"&Zeta;", 918 },
  { @"&Eta;", 919 },
  { @"&Theta;", 920 },
  { @"&Iota;", 921 },
  { @"&Kappa;", 922 },
  { @"&Lambda;", 923 },
  { @"&Mu;", 924 },
  { @"&Nu;", 925 },
  { @"&Xi;", 926 },
  { @"&Omicron;", 927 },
  { @"&Pi;", 928 },
  { @"&Rho;", 929 },
  { @"&Sigma;", 931 },
  { @"&Tau;", 932 },
  { @"&Upsilon;", 933 },
  { @"&Phi;", 934 },
  { @"&Chi;", 935 },
  { @"&Psi;", 936 },
  { @"&Omega;", 937 },
  { @"&alpha;", 945 },
  { @"&beta;", 946 },
  { @"&gamma;", 947 },
  { @"&delta;", 948 },
  { @"&epsilon;", 949 },
  { @"&zeta;", 950 },
  { @"&eta;", 951 },
  { @"&theta;", 952 },
  { @"&iota;", 953 },
  { @"&kappa;", 954 },
  { @"&lambda;", 955 },
  { @"&mu;", 956 },
  { @"&nu;", 957 },
  { @"&xi;", 958 },
  { @"&omicron;", 959 },
  { @"&pi;", 960 },
  { @"&rho;", 961 },
  { @"&sigmaf;", 962 },
  { @"&sigma;", 963 },
  { @"&tau;", 964 },
  { @"&upsilon;", 965 },
  { @"&phi;", 966 },
  { @"&chi;", 967 },
  { @"&psi;", 968 },
  { @"&omega;", 969 },
  { @"&thetasym;", 977 },
  { @"&upsih;", 978 },
  { @"&piv;", 982 },

  // A.2.2. Special characters cont'd
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },

  // A.2.3. Symbols cont'd
  { @"&bull;", 8226 },
  { @"&hellip;", 8230 },

  // A.2.2. Special characters cont'd
  { @"&permil;", 8240 },

  // A.2.3. Symbols cont'd
  { @"&prime;", 8242 },
  { @"&Prime;", 8243 },

  // A.2.2. Special characters cont'd
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },

  // A.2.3. Symbols cont'd
  { @"&oline;", 8254 },
  { @"&frasl;", 8260 },

  // A.2.2. Special characters cont'd
  { @"&euro;", 8364 },

  // A.2.3. Symbols cont'd
  { @"&image;", 8465 },
  { @"&weierp;", 8472 },
  { @"&real;", 8476 },
  { @"&trade;", 8482 },
  { @"&alefsym;", 8501 },
  { @"&larr;", 8592 },
  { @"&uarr;", 8593 },
  { @"&rarr;", 8594 },
  { @"&darr;", 8595 },
  { @"&harr;", 8596 },
  { @"&crarr;", 8629 },
  { @"&lArr;", 8656 },
  { @"&uArr;", 8657 },
  { @"&rArr;", 8658 },
  { @"&dArr;", 8659 },
  { @"&hArr;", 8660 },
  { @"&forall;", 8704 },
  { @"&part;", 8706 },
  { @"&exist;", 8707 },
  { @"&empty;", 8709 },
  { @"&nabla;", 8711 },
  { @"&isin;", 8712 },
  { @"&notin;", 8713 },
  { @"&ni;", 8715 },
  { @"&prod;", 8719 },
  { @"&sum;", 8721 },
  { @"&minus;", 8722 },
  { @"&lowast;", 8727 },
  { @"&radic;", 8730 },
  { @"&prop;", 8733 },
  { @"&infin;", 8734 },
  { @"&ang;", 8736 },
  { @"&and;", 8743 },
  { @"&or;", 8744 },
  { @"&cap;", 8745 },
  { @"&cup;", 8746 },
  { @"&int;", 8747 },
  { @"&there4;", 8756 },
  { @"&sim;", 8764 },
  { @"&cong;", 8773 },
  { @"&asymp;", 8776 },
  { @"&ne;", 8800 },
  { @"&equiv;", 8801 },
  { @"&le;", 8804 },
  { @"&ge;", 8805 },
  { @"&sub;", 8834 },
  { @"&sup;", 8835 },
  { @"&nsub;", 8836 },
  { @"&sube;", 8838 },
  { @"&supe;", 8839 },
  { @"&oplus;", 8853 },
  { @"&otimes;", 8855 },
  { @"&perp;", 8869 },
  { @"&sdot;", 8901 },
  { @"&lceil;", 8968 },
  { @"&rceil;", 8969 },
  { @"&lfloor;", 8970 },
  { @"&rfloor;", 8971 },
  { @"&lang;", 9001 },
  { @"&rang;", 9002 },
  { @"&loz;", 9674 },
  { @"&spades;", 9824 },
  { @"&clubs;", 9827 },
  { @"&hearts;", 9829 },
  { @"&diams;", 9830 }
};
311
// Escape table used when the output may contain non-ASCII text: only the
// characters from table A.2.2 (Special Characters) are replaced.
// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// Ordered by uchar lowest to highest, because lookups use bsearch() keyed on
// uchar.
static HTMLEscapeMap gUnicodeHTMLEscapeMap[] = {
  // C0 Controls and Basic Latin
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // Latin Extended-A
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // Spacing Modifier Letters
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // General Punctuation
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  { @"&permil;", 8240 },
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },
  { @"&euro;", 8364 },
};
356
357
// bsearch() comparator for the escape tables.
//  ucharVoid - points at the unichar being looked up (the search key).
//  mapVoid   - points at the HTMLEscapeMap entry being probed.
// Returns 1 if the key sorts after the entry's uchar, -1 if it sorts before,
// and 0 when they are equal.
static int EscapeMapCompare(const void *ucharVoid, const void *mapVoid) {
  const unichar key = *(const unichar *)ucharVoid;
  const unichar candidate = ((const HTMLEscapeMap *)mapVoid)->uchar;
  if (key == candidate) {
    return 0;
  }
  return (key > candidate) ? 1 : -1;
}
372
373@implementation NSString (GTMNSStringHTMLAdditions)
374
// Returns the receiver with every character listed in |table| replaced by
// that entry's escape sequence.
//
//  table         - escape entries sorted ascending by uchar (searched with
//                  bsearch()).
//  size          - size of |table| in BYTES (i.e. sizeof(theArray)), not the
//                  number of entries.
//  escapeUnicode - when YES, any UTF-16 code unit above 127 that has no entry
//                  in |table| is written as a decimal reference "&#NNN;".
//
// Returns the receiver itself when it is empty, nil if a working buffer could
// not be obtained, and otherwise a new autoreleased mutable string.
//
// NOTE: escaping works on individual UTF-16 code units, so with
// |escapeUnicode| a character outside the BMP is emitted as two references,
// one per surrogate.
- (NSString *)gtm_stringByEscapingHTMLUsingTable:(HTMLEscapeMap*)table 
                                          ofSize:(NSUInteger)size 
                                 escapingUnicode:(BOOL)escapeUnicode {  
  NSUInteger length = [self length];
  if (!length) {
    return self;
  }
  
  NSMutableString *finalString = [NSMutableString string];

  // Scratch space that accumulates runs of characters needing no escaping so
  // they can be appended in one call. It must be created with a real length:
  // -dataWithCapacity: leaves the length at 0, and -mutableBytes is not
  // guaranteed to return usable storage for zero-length data.
  NSMutableData *data2 = [NSMutableData dataWithLength:sizeof(unichar) * length];

  // this block is common between GTMNSString+HTML and GTMNSString+XML but
  // it's so short that it isn't really worth trying to share.
  const unichar *buffer = CFStringGetCharactersPtr((CFStringRef)self);
  if (!buffer) {
    // No direct pointer available, so copy the characters out.
    // We want this buffer to be autoreleased.
    NSMutableData *data = [NSMutableData dataWithLength:length * sizeof(UniChar)];
    if (!data) {
      // COV_NF_START  - Memory fail case
      _GTMDevLog(@"couldn't alloc buffer");
      return nil;
      // COV_NF_END
    }
    [self getCharacters:[data mutableBytes] range:NSMakeRange(0, length)];
    buffer = [data bytes];
  }

  if (!buffer || !data2) {
    // COV_NF_START
    _GTMDevLog(@"Unable to allocate buffer or data2");
    return nil;
    // COV_NF_END
  }
  
  unichar *buffer2 = (unichar *)[data2 mutableBytes];
  NSUInteger buffer2Length = 0;

  // |size| is in bytes; bsearch() wants an element count.
  const size_t entryCount = size / sizeof(HTMLEscapeMap);
  
  for (NSUInteger i = 0; i < length; ++i) {
    HTMLEscapeMap *val = bsearch(&buffer[i], table, 
                                 entryCount, 
                                 sizeof(HTMLEscapeMap), EscapeMapCompare);
    if (val || (escapeUnicode && buffer[i] > 127)) {
      // Flush the pending run of unescaped characters before the escape.
      if (buffer2Length) {
        CFStringAppendCharacters((CFMutableStringRef)finalString, 
                                 buffer2, 
                                 buffer2Length);
        buffer2Length = 0;
      }
      if (val) {
        [finalString appendString:val->escapeSequence];
      }
      else {
        _GTMDevAssert(escapeUnicode && buffer[i] > 127, @"Illegal Character");
        [finalString appendFormat:@"&#%d;", buffer[i]];
      }
    } else {
      buffer2[buffer2Length] = buffer[i];
      buffer2Length += 1;
    }
  }
  // Flush whatever unescaped tail is left.
  if (buffer2Length) {
    CFStringAppendCharacters((CFMutableStringRef)finalString, 
                             buffer2, 
                             buffer2Length);
  }
  return finalString;
}
443
// Escapes the receiver using gUnicodeHTMLEscapeMap. Characters that are not
// in that table, including non-ASCII ones, are passed through unchanged
// (escapingUnicode is NO).
- (NSString *)gtm_stringByEscapingForHTML {
  const NSUInteger tableBytes = sizeof(gUnicodeHTMLEscapeMap);
  NSString *escaped =
      [self gtm_stringByEscapingHTMLUsingTable:gUnicodeHTMLEscapeMap
                                        ofSize:tableBytes
                               escapingUnicode:NO];
  return escaped;
} // gtm_stringByEscapingForHTML
449
// Escapes the receiver using gAsciiHTMLEscapeMap. Any code unit above 127
// that has no entry in that table is written as a decimal reference
// (escapingUnicode is YES).
- (NSString *)gtm_stringByEscapingForAsciiHTML {
  const NSUInteger tableBytes = sizeof(gAsciiHTMLEscapeMap);
  NSString *escaped =
      [self gtm_stringByEscapingHTMLUsingTable:gAsciiHTMLEscapeMap
                                        ofSize:tableBytes
                               escapingUnicode:YES];
  return escaped;
} // gtm_stringByEscapingForAsciiHTML
455
// Converts a Unicode code point into an NSString.
// Returns nil for 0 and for values above U+10FFFF. Code points above U+FFFF
// are encoded as a UTF-16 surrogate pair. Values in the surrogate range are
// passed through as a single code unit so that per-code-unit references such
// as those produced by -gtm_stringByEscapingForAsciiHTML recombine correctly.
static NSString *GTMStringFromHTMLCodePoint(unsigned long codePoint) {
  if (codePoint == 0 || codePoint > 0x10FFFFUL) {
    return nil;
  }
  unichar units[2];
  NSUInteger unitCount = 1;
  if (codePoint <= 0xFFFFUL) {
    units[0] = (unichar)codePoint;
  } else {
    unsigned long offset = codePoint - 0x10000UL;
    units[0] = (unichar)(0xD800 + (offset >> 10));    // high surrogate
    units[1] = (unichar)(0xDC00 + (offset & 0x3FF));  // low surrogate
    unitCount = 2;
  }
  return [NSString stringWithCharacters:units length:unitCount];
}

// Returns the receiver with HTML character references decoded.
//
// Three forms are recognised: hexadecimal ("&#xa3;"), decimal ("&#163;") and
// the named entities listed in gAsciiHTMLEscapeMap. Anything that does not
// parse as one of those is left in the output untouched.
//
// Returns the receiver itself when it contains no '&'; otherwise a new
// autoreleased mutable string.
- (NSString *)gtm_stringByUnescapingFromHTML {
  NSRange range = NSMakeRange(0, [self length]);
  NSRange subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range];
  
  // if no ampersands, we've got a quick way out
  if (subrange.length == 0) return self;
  NSMutableString *finalString = [NSMutableString stringWithString:self];

  // Walk the ampersands from last to first. Searching happens in |self| while
  // replacing happens in |finalString|; going backwards means every edit is
  // to the right of the text still to be examined, so offsets found in |self|
  // stay valid for |finalString|.
  do {
    // Only look for the ';' between this '&' and the previously handled one.
    NSRange semiColonRange = NSMakeRange(subrange.location, NSMaxRange(range) - subrange.location);
    semiColonRange = [self rangeOfString:@";" options:0 range:semiColonRange];
    range = NSMakeRange(0, subrange.location);
    // if we don't find a semicolon in the range, we don't have a sequence
    if (semiColonRange.location == NSNotFound) {
      continue;
    }
    NSRange escapeRange = NSMakeRange(subrange.location, semiColonRange.location - subrange.location + 1);
    NSString *escapeString = [self substringWithRange:escapeRange];
    NSUInteger length = [escapeString length];
    // a sequence must be longer than 3 (&lt;) and less than 11 (&thetasym;)
    if (length <= 3 || length >= 11) {
      continue;
    }

    NSString *replacement = nil;
    if ([escapeString characterAtIndex:1] == '#') {
      unichar char2 = [escapeString characterAtIndex:2];
      if (char2 == 'x' || char2 == 'X') {
        // Hex escape sequences &#xa3;
        NSUInteger digitCount = length - 4;  // minus "&#x" and ";"
        NSString *hexSequence = [escapeString substringWithRange:NSMakeRange(3, digitCount)];
        NSScanner *scanner = [NSScanner scannerWithString:hexSequence];
        unsigned value;
        // The scanner must consume every digit, otherwise the sequence
        // contained trailing garbage.
        if ([scanner scanHexInt:&value] &&
            [scanner scanLocation] == digitCount) {
          replacement = GTMStringFromHTMLCodePoint(value);
        }
      } else {
        // Decimal Sequences &#123;
        NSUInteger digitCount = length - 3;  // minus "&#" and ";"
        NSString *numberSequence = [escapeString substringWithRange:NSMakeRange(2, digitCount)];
        NSScanner *scanner = [NSScanner scannerWithString:numberSequence];
        int value;
        if ([scanner scanInt:&value] &&
            value > 0 &&
            [scanner scanLocation] == digitCount) {
          replacement = GTMStringFromHTMLCodePoint((unsigned long)value);
        }
      }
    } else {
      // "standard" sequences
      const unsigned entryCount = sizeof(gAsciiHTMLEscapeMap) / sizeof(HTMLEscapeMap);
      for (unsigned i = 0; i < entryCount; ++i) {
        if ([escapeString isEqualToString:gAsciiHTMLEscapeMap[i].escapeSequence]) {
          replacement = [NSString stringWithCharacters:&gAsciiHTMLEscapeMap[i].uchar length:1];
          break;
        }
      }
    }

    if (replacement) {
      [finalString replaceCharactersInRange:escapeRange withString:replacement];
    }
  } while ((subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range]).length != 0);
  return finalString;
} // gtm_stringByUnescapingFromHTML
519
520
521
522@end