// Source path: /core/externals/update-engine/externals/google-toolbox-for-mac/Foundation/GTMNSString+HTML.m
// Objective C | 522 lines | 431 code | 37 blank | 54 comment | 45 complexity | 9e43339edaea9d54c39f10525f653d76 MD5 | raw file
//
//  GTMNSString+HTML.m
//  Dealing with NSStrings that contain HTML
//
//  Copyright 2006-2008 Google Inc.
//
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not
//  use this file except in compliance with the License.  You may obtain a copy
//  of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
//  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
//  License for the specific language governing permissions and limitations under
//  the License.
//

#import "GTMDefines.h"
#import "GTMNSString+HTML.h"

// One entry of an escape table: the full entity text (including the leading
// '&' and trailing ';') and the UTF-16 code unit it stands for.
typedef struct {
  NSString *escapeSequence;
  unichar uchar;
} HTMLEscapeMap;

// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// Ordered by uchar lowest to highest for bsearching
static HTMLEscapeMap gAsciiHTMLEscapeMap[] = {
  // A.2.2. Special characters
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // A.2.1. Latin-1 characters
  { @"&nbsp;", 160 },
  { @"&iexcl;", 161 },
  { @"&cent;", 162 },
  { @"&pound;", 163 },
  { @"&curren;", 164 },
  { @"&yen;", 165 },
  { @"&brvbar;", 166 },
  { @"&sect;", 167 },
  { @"&uml;", 168 },
  { @"&copy;", 169 },
  { @"&ordf;", 170 },
  { @"&laquo;", 171 },
  { @"&not;", 172 },
  { @"&shy;", 173 },
  { @"&reg;", 174 },
  { @"&macr;", 175 },
  { @"&deg;", 176 },
  { @"&plusmn;", 177 },
  { @"&sup2;", 178 },
  { @"&sup3;", 179 },
  { @"&acute;", 180 },
  { @"&micro;", 181 },
  { @"&para;", 182 },
  { @"&middot;", 183 },
  { @"&cedil;", 184 },
  { @"&sup1;", 185 },
  { @"&ordm;", 186 },
  { @"&raquo;", 187 },
  { @"&frac14;", 188 },
  { @"&frac12;", 189 },
  { @"&frac34;", 190 },
  { @"&iquest;", 191 },
  { @"&Agrave;", 192 },
  { @"&Aacute;", 193 },
  { @"&Acirc;", 194 },
  { @"&Atilde;", 195 },
  { @"&Auml;", 196 },
  { @"&Aring;", 197 },
  { @"&AElig;", 198 },
  { @"&Ccedil;", 199 },
  { @"&Egrave;", 200 },
  { @"&Eacute;", 201 },
  { @"&Ecirc;", 202 },
  { @"&Euml;", 203 },
  { @"&Igrave;", 204 },
  { @"&Iacute;", 205 },
  { @"&Icirc;", 206 },
  { @"&Iuml;", 207 },
  { @"&ETH;", 208 },
  { @"&Ntilde;", 209 },
  { @"&Ograve;", 210 },
  { @"&Oacute;", 211 },
  { @"&Ocirc;", 212 },
  { @"&Otilde;", 213 },
  { @"&Ouml;", 214 },
  { @"&times;", 215 },
  { @"&Oslash;", 216 },
  { @"&Ugrave;", 217 },
  { @"&Uacute;", 218 },
  { @"&Ucirc;", 219 },
  { @"&Uuml;", 220 },
  { @"&Yacute;", 221 },
  { @"&THORN;", 222 },
  { @"&szlig;", 223 },
  { @"&agrave;", 224 },
  { @"&aacute;", 225 },
  { @"&acirc;", 226 },
  { @"&atilde;", 227 },
  { @"&auml;", 228 },
  { @"&aring;", 229 },
  { @"&aelig;", 230 },
  { @"&ccedil;", 231 },
  { @"&egrave;", 232 },
  { @"&eacute;", 233 },
  { @"&ecirc;", 234 },
  { @"&euml;", 235 },
  { @"&igrave;", 236 },
  { @"&iacute;", 237 },
  { @"&icirc;", 238 },
  { @"&iuml;", 239 },
  { @"&eth;", 240 },
  { @"&ntilde;", 241 },
  { @"&ograve;", 242 },
  { @"&oacute;", 243 },
  { @"&ocirc;", 244 },
  { @"&otilde;", 245 },
  { @"&ouml;", 246 },
  { @"&divide;", 247 },
  { @"&oslash;", 248 },
  { @"&ugrave;", 249 },
  { @"&uacute;", 250 },
  { @"&ucirc;", 251 },
  { @"&uuml;", 252 },
  { @"&yacute;", 253 },
  { @"&thorn;", 254 },
  { @"&yuml;", 255 },

  // A.2.2. Special characters cont'd
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // A.2.3. Symbols
  { @"&fnof;", 402 },

  // A.2.2. Special characters cont'd
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // A.2.3. Symbols cont'd
  { @"&Alpha;", 913 },
  { @"&Beta;", 914 },
  { @"&Gamma;", 915 },
  { @"&Delta;", 916 },
  { @"&Epsilon;", 917 },
  { @"&Zeta;", 918 },
  { @"&Eta;", 919 },
  { @"&Theta;", 920 },
  { @"&Iota;", 921 },
  { @"&Kappa;", 922 },
  { @"&Lambda;", 923 },
  { @"&Mu;", 924 },
  { @"&Nu;", 925 },
  { @"&Xi;", 926 },
  { @"&Omicron;", 927 },
  { @"&Pi;", 928 },
  { @"&Rho;", 929 },
  { @"&Sigma;", 931 },
  { @"&Tau;", 932 },
  { @"&Upsilon;", 933 },
  { @"&Phi;", 934 },
  { @"&Chi;", 935 },
  { @"&Psi;", 936 },
  { @"&Omega;", 937 },
  { @"&alpha;", 945 },
  { @"&beta;", 946 },
  { @"&gamma;", 947 },
  { @"&delta;", 948 },
  { @"&epsilon;", 949 },
  { @"&zeta;", 950 },
  { @"&eta;", 951 },
  { @"&theta;", 952 },
  { @"&iota;", 953 },
  { @"&kappa;", 954 },
  { @"&lambda;", 955 },
  { @"&mu;", 956 },
  { @"&nu;", 957 },
  { @"&xi;", 958 },
  { @"&omicron;", 959 },
  { @"&pi;", 960 },
  { @"&rho;", 961 },
  { @"&sigmaf;", 962 },
  { @"&sigma;", 963 },
  { @"&tau;", 964 },
  { @"&upsilon;", 965 },
  { @"&phi;", 966 },
  { @"&chi;", 967 },
  { @"&psi;", 968 },
  { @"&omega;", 969 },
  { @"&thetasym;", 977 },
  { @"&upsih;", 978 },
  { @"&piv;", 982 },

  // A.2.2. Special characters cont'd
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  // A.2.3. Symbols cont'd
  { @"&bull;", 8226 },
  { @"&hellip;", 8230 },

  // A.2.2. Special characters cont'd
  { @"&permil;", 8240 },

  // A.2.3. Symbols cont'd
  { @"&prime;", 8242 },
  { @"&Prime;", 8243 },

  // A.2.2. Special characters cont'd
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },

  // A.2.3. Symbols cont'd
  { @"&oline;", 8254 },
  { @"&frasl;", 8260 },

  // A.2.2. Special characters cont'd
  { @"&euro;", 8364 },

  // A.2.3. Symbols cont'd
  { @"&image;", 8465 },
  { @"&weierp;", 8472 },
  { @"&real;", 8476 },
  { @"&trade;", 8482 },
  { @"&alefsym;", 8501 },
  { @"&larr;", 8592 },
  { @"&uarr;", 8593 },
  { @"&rarr;", 8594 },
  { @"&darr;", 8595 },
  { @"&harr;", 8596 },
  { @"&crarr;", 8629 },
  { @"&lArr;", 8656 },
  { @"&uArr;", 8657 },
  { @"&rArr;", 8658 },
  { @"&dArr;", 8659 },
  { @"&hArr;", 8660 },
  { @"&forall;", 8704 },
  { @"&part;", 8706 },
  { @"&exist;", 8707 },
  { @"&empty;", 8709 },
  { @"&nabla;", 8711 },
  { @"&isin;", 8712 },
  { @"&notin;", 8713 },
  { @"&ni;", 8715 },
  { @"&prod;", 8719 },
  { @"&sum;", 8721 },
  { @"&minus;", 8722 },
  { @"&lowast;", 8727 },
  { @"&radic;", 8730 },
  { @"&prop;", 8733 },
  { @"&infin;", 8734 },
  { @"&ang;", 8736 },
  { @"&and;", 8743 },
  { @"&or;", 8744 },
  { @"&cap;", 8745 },
  { @"&cup;", 8746 },
  { @"&int;", 8747 },
  { @"&there4;", 8756 },
  { @"&sim;", 8764 },
  { @"&cong;", 8773 },
  { @"&asymp;", 8776 },
  { @"&ne;", 8800 },
  { @"&equiv;", 8801 },
  { @"&le;", 8804 },
  { @"&ge;", 8805 },
  { @"&sub;", 8834 },
  { @"&sup;", 8835 },
  { @"&nsub;", 8836 },
  { @"&sube;", 8838 },
  { @"&supe;", 8839 },
  { @"&oplus;", 8853 },
  { @"&otimes;", 8855 },
  { @"&perp;", 8869 },
  { @"&sdot;", 8901 },
  { @"&lceil;", 8968 },
  { @"&rceil;", 8969 },
  { @"&lfloor;", 8970 },
  { @"&rfloor;", 8971 },
  { @"&lang;", 9001 },
  { @"&rang;", 9002 },
  { @"&loz;", 9674 },
  { @"&spades;", 9824 },
  { @"&clubs;", 9827 },
  { @"&hearts;", 9829 },
  { @"&diams;", 9830 }
};

// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// This is table A.2.2 Special Characters
// Like the table above, entries are in ascending uchar order because the
// table is searched with bsearch().
static HTMLEscapeMap gUnicodeHTMLEscapeMap[] = {
  // C0 Controls and Basic Latin
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // Latin Extended-A
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // Spacing Modifier Letters
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // General Punctuation
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  { @"&permil;", 8240 },
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },
  { @"&euro;", 8364 },
};


// Utility function for bsearching the tables above.
// bsearch() passes the key first and the candidate table element second, so
// ucharVoid points at a unichar and mapVoid at an HTMLEscapeMap entry.
// Returns 1, -1 or 0 when the key is greater than, less than or equal to the
// entry's uchar.
static int EscapeMapCompare(const void *ucharVoid, const void *mapVoid) {
  const unichar *uchar = (const unichar*)ucharVoid;
  const HTMLEscapeMap *map = (const HTMLEscapeMap*)mapVoid;
  int val;
  if (*uchar > map->uchar) {
    val = 1;
  } else if (*uchar < map->uchar) {
    val = -1;
  } else {
    val = 0;
  }
  return val;
}

@implementation NSString (GTMNSStringHTMLAdditions)

// Shared worker for the two public escaping methods.
//
// Args:
//   table: escape table to look characters up in; must be sorted by uchar
//          because it is searched with bsearch().
//   size: size of |table| in BYTES (callers pass sizeof(table)); it is divided
//         by sizeof(HTMLEscapeMap) to get the entry count.
//   escapeUnicode: when YES, any code unit above 127 that is not in |table| is
//                  written as a decimal numeric reference (&#NNN;).
//
// Returns:
//   self when the receiver is empty, nil if a working buffer could not be
//   allocated, otherwise a new autoreleased string with the escapes applied.
- (NSString *)gtm_stringByEscapingHTMLUsingTable:(HTMLEscapeMap*)table
                                          ofSize:(NSUInteger)size
                                 escapingUnicode:(BOOL)escapeUnicode {
  NSUInteger length = [self length];
  if (!length) {
    return self;
  }

  NSMutableString *finalString = [NSMutableString string];
  // Scratch space for runs of characters that need no escaping. It has to be
  // created with a real length: dataWithCapacity: leaves the length at zero,
  // and -mutableBytes of a zero-length data object is not guaranteed to point
  // at usable storage.
  NSMutableData *data2 = [NSMutableData dataWithLength:sizeof(unichar) * length];

  // this block is common between GTMNSString+HTML and GTMNSString+XML but
  // it's so short that it isn't really worth trying to share.
  const unichar *buffer = CFStringGetCharactersPtr((CFStringRef)self);
  if (!buffer) {
    // We want this buffer to be autoreleased.
    NSMutableData *data = [NSMutableData dataWithLength:length * sizeof(UniChar)];
    if (!data) {
      // COV_NF_START  - Memory fail case
      _GTMDevLog(@"couldn't alloc buffer");
      return nil;
      // COV_NF_END
    }
    // Bounded copy of exactly |length| code units into the buffer sized above.
    [self getCharacters:[data mutableBytes] range:NSMakeRange(0, length)];
    buffer = [data bytes];
  }

  if (!buffer || !data2) {
    // COV_NF_START
    _GTMDevLog(@"Unable to allocate buffer or data2");
    return nil;
    // COV_NF_END
  }

  unichar *buffer2 = (unichar *)[data2 mutableBytes];

  NSUInteger buffer2Length = 0;

  for (NSUInteger i = 0; i < length; ++i) {
    HTMLEscapeMap *val = bsearch(&buffer[i], table,
                                 size / sizeof(HTMLEscapeMap),
                                 sizeof(HTMLEscapeMap), EscapeMapCompare);
    if (val || (escapeUnicode && buffer[i] > 127)) {
      // Flush the pending run of unescaped characters before the escape.
      if (buffer2Length) {
        CFStringAppendCharacters((CFMutableStringRef)finalString,
                                 buffer2,
                                 buffer2Length);
        buffer2Length = 0;
      }
      if (val) {
        [finalString appendString:val->escapeSequence];
      }
      else {
        _GTMDevAssert(escapeUnicode && buffer[i] > 127, @"Illegal Character");
        [finalString appendFormat:@"&#%d;", buffer[i]];
      }
    } else {
      buffer2[buffer2Length] = buffer[i];
      buffer2Length += 1;
    }
  }
  // Flush whatever unescaped run is left at the end of the string.
  if (buffer2Length) {
    CFStringAppendCharacters((CFMutableStringRef)finalString,
                             buffer2,
                             buffer2Length);
  }
  return finalString;
}

// Escapes only the characters listed in gUnicodeHTMLEscapeMap; every other
// character (including other non-ASCII ones) is passed through unchanged.
- (NSString *)gtm_stringByEscapingForHTML {
  return [self gtm_stringByEscapingHTMLUsingTable:gUnicodeHTMLEscapeMap
                                           ofSize:sizeof(gUnicodeHTMLEscapeMap)
                                  escapingUnicode:NO];
} // gtm_stringByEscapingHTML

// Escapes using gAsciiHTMLEscapeMap and additionally writes every remaining
// code unit above 127 as a decimal numeric reference.
- (NSString *)gtm_stringByEscapingForAsciiHTML {
  return [self gtm_stringByEscapingHTMLUsingTable:gAsciiHTMLEscapeMap
                                           ofSize:sizeof(gAsciiHTMLEscapeMap)
                                  escapingUnicode:YES];
} // gtm_stringByEscapingAsciiHTML

// Replaces named (gAsciiHTMLEscapeMap), decimal (&#NNN;) and hexadecimal
// (&#xHH;) references with the character they stand for. Sequences that are
// not recognised are left untouched. Returns self when the receiver contains
// no ampersand, otherwise a new autoreleased string.
- (NSString *)gtm_stringByUnescapingFromHTML {
  NSRange range = NSMakeRange(0, [self length]);
  NSRange subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range];

  // if no ampersands, we've got a quick way out
  if (subrange.length == 0) return self;
  NSMutableString *finalString = [NSMutableString stringWithString:self];
  // The string is walked from the last ampersand to the first. Ranges are
  // computed against self and applied to finalString; that stays valid because
  // a replacement only shifts text located after the positions still to be
  // visited.
  do {
    // Look for the terminating ';' between this '&' and the previously
    // handled one (or the end of the string on the first pass).
    NSRange semiColonRange = NSMakeRange(subrange.location, NSMaxRange(range) - subrange.location);
    semiColonRange = [self rangeOfString:@";" options:0 range:semiColonRange];
    range = NSMakeRange(0, subrange.location);
    // if we don't find a semicolon in the range, we don't have a sequence
    if (semiColonRange.location == NSNotFound) {
      continue;
    }
    NSRange escapeRange = NSMakeRange(subrange.location, semiColonRange.location - subrange.location + 1);
    NSString *escapeString = [self substringWithRange:escapeRange];
    NSUInteger length = [escapeString length];
    // a sequence must be longer than 3 (&lt;) and less than 11 (&thetasym;)
    if (length > 3 && length < 11) {
      if ([escapeString characterAtIndex:1] == '#') {
        unichar char2 = [escapeString characterAtIndex:2];
        if (char2 == 'x' || char2 == 'X') {
          // Hex escape sequences &#xa3;
          // Digits sit between the "&#x" prefix (3) and the ';' suffix (1).
          NSString *hexSequence = [escapeString substringWithRange:NSMakeRange(3, length - 4)];
          NSScanner *scanner = [NSScanner scannerWithString:hexSequence];
          unsigned value;
          // The scanLocation check rejects sequences with trailing junk.
          if ([scanner scanHexInt:&value] &&
              value < USHRT_MAX &&
              value > 0
              && [scanner scanLocation] == length - 4) {
            unichar uchar = (unichar)value;
            NSString *charString = [NSString stringWithCharacters:&uchar length:1];
            [finalString replaceCharactersInRange:escapeRange withString:charString];
          }

        } else {
          // Decimal Sequences &#123;
          // Digits sit between the "&#" prefix (2) and the ';' suffix (1).
          NSString *numberSequence = [escapeString substringWithRange:NSMakeRange(2, length - 3)];
          NSScanner *scanner = [NSScanner scannerWithString:numberSequence];
          int value;
          if ([scanner scanInt:&value] &&
              value < USHRT_MAX &&
              value > 0
              && [scanner scanLocation] == length - 3) {
            unichar uchar = (unichar)value;
            NSString *charString = [NSString stringWithCharacters:&uchar length:1];
            [finalString replaceCharactersInRange:escapeRange withString:charString];
          }
        }
      } else {
        // "standard" sequences
        // The table is ordered by code point, not by name, so the lookup by
        // name is a linear scan.
        for (unsigned i = 0; i < sizeof(gAsciiHTMLEscapeMap) / sizeof(HTMLEscapeMap); ++i) {
          if ([escapeString isEqualToString:gAsciiHTMLEscapeMap[i].escapeSequence]) {
            [finalString replaceCharactersInRange:escapeRange withString:[NSString stringWithCharacters:&gAsciiHTMLEscapeMap[i].uchar length:1]];
            break;
          }
        }
      }
    }
  } while ((subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range]).length != 0);
  return finalString;
} // gtm_stringByUnescapingHTML



@end