PageRenderTime 46ms CodeModel.GetById 9ms RepoModel.GetById 0ms app.codeStats 0ms

/iulib/colib/iustring.h

https://code.google.com/p/ocropus/
C Header | 883 lines | 821 code | 21 blank | 41 comment | 140 complexity | 436a2ed358c9b0ebcc2ae4d8836c070d MD5 | raw file
Possible License(s): Apache-2.0
  1. // -*- C++ -*-
  2. // Copyright 2009 Deutsches Forschungszentrum fuer Kuenstliche Intelligenz
  3. // or its licensors, as applicable.
  4. //
  5. // You may not use this file except under the terms of the accompanying license.
  6. //
  7. // Licensed under the Apache License, Version 2.0 (the "License"); you
  8. // may not use this file except in compliance with the License. You may
  9. // obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS,
  13. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. // See the License for the specific language governing permissions and
  15. // limitations under the License.
  16. //
  17. // Project: iulib -- image understanding library
  18. // File: iustring.h
  19. // Purpose: iu string
  20. // Responsible: remat
  21. // Reviewer:
  22. // Primary Repository:
  23. // Web Sites: www.iupr.org, www.dfki.de
  24. #ifndef iustring_h__
  25. #define iustring_h__
  26. /// \file iustring.h
  27. /// \brief iu String
  28. #include <cstdio>
  29. #include <cstdarg>
  30. #include <limits.h>
  31. #include <regex.h>
  32. #include "narray.h"
  33. #include "nustring.h"
  34. namespace colib {
  35. class EncodedString : protected bytearray {
  36. protected:
  37. EncodedString() {}
  38. public:
  39. void clear() {
  40. return bytearray::clear();
  41. }
  42. int length() const {
  43. return bytearray::dim(0);
  44. }
  45. bool equal(EncodedString& other) {
  46. return bytearray::equal(other);
  47. }
  48. virtual void pushUnicode(const nuchar&) = 0;
  49. virtual nuchar getUnicode(int&) = 0;
  50. int fwrite(FILE *file) {
  51. return ::fwrite(data, 1, dim(0), file);
  52. }
  53. int fread(FILE *file) {
  54. unsigned char c;
  55. int i = 0;
  56. while(::fread(&c, 1, 1, file) == 1) {
  57. push(c);
  58. i++;
  59. }
  60. return i;
  61. }
  62. EncodedString& fgets(FILE* stream = stdin) {
  63. int c;
  64. while(((c = fgetc(stream)) != EOF) && (c != '\n')) {
  65. push(c);
  66. }
  67. return *this;
  68. }
  69. int fputs(FILE* stream = stdout) {
  70. for(int i=0; i<length(); i++) {
  71. if(fputc(at(i), stream) == EOF) {
  72. return EOF;
  73. }
  74. }
  75. if(fputc('\n', stream) == EOF) {
  76. return EOF;
  77. }
  78. return length() + 1;
  79. }
  80. void copyTo(char* result, int size) {
  81. if(length()>=size) throw "not enough room";
  82. for(int i=0; i<length(); i++) {
  83. result[i] = (*this)[i];
  84. }
  85. result[length()] = '\0';
  86. }
  87. const unsigned char* c_str() {
  88. return data;
  89. }
  90. };
  91. class utf8strg : public EncodedString {
  92. public:
  93. void pushUnicode(const nuchar& x) {
  94. unsigned int c = x.ord();
  95. // -- one byte --
  96. if(c < 128) {
  97. push(c);
  98. // -- two bytes --
  99. } else if(c < 2048) {
  100. push(0xC0 | (c >> 6));
  101. push(0x80 | (c & 0x3F));
  102. // -- three bytes --
  103. } else if(c < 65536) {
  104. push(0xE0 | (c >> 12));
  105. push(0x80 | ((c >> 6) & 0x3F));
  106. push(0x80 | (c & 0x3F));
  107. // -- four bytes --
  108. } else if(c < 2097152) {
  109. push(0xF0 | (c >> 18));
  110. push(0x80 | ((c >> 12) & 0x3F));
  111. push(0x80 | ((c >> 6) & 0x3F));
  112. push(0x80 | (c & 0x3F));
  113. } else {
  114. throw "UTF-8 encoding error";
  115. }
  116. }
  117. nuchar getUnicode(int& i) {
  118. return decode(*this, i);
  119. }
  120. template<class T>
  121. static nuchar decode(T& str, int& i) {
  122. unsigned int x = 0;
  123. int b = -1;
  124. while(b != 0) {
  125. unsigned char c = str[i++];
  126. // -- ASCII --
  127. if(c < 128) {
  128. x = c;
  129. b = 0;
  130. // -- not first byte --
  131. } else if(c < 0xC0) {
  132. if(b<=0) {
  133. throw "UTF-8 decoding error";
  134. }
  135. x += (c & 0x3F) << (6*(b-1));
  136. b--;
  137. // -- first of two bytes --
  138. } else if(c < 0xE0) {
  139. x = (c & 0x1F) << 6;
  140. b = 1;
  141. // -- first of three bytes --
  142. } else if(c < 0xF0) {
  143. x = (c & 0xF) << 12;
  144. b = 2;
  145. // -- first of four bytes --
  146. } else {
  147. x = (c & 0x7) << 18;
  148. b = 3;
  149. }
  150. }
  151. return nuchar(x);
  152. }
  153. };
  154. /**
  155. * @brief counted string class based on narray
  156. * implements most methods of std::string with same arguments
  157. */
  158. template<class T> class iustrg {
  159. public:
  160. iustrg() : len(0) {
  161. }
  162. iustrg(int n) : buf(n), len(0) {
  163. if(n > 0) {
  164. buf.at(0) = T('\0');
  165. }
  166. }
  167. template<class A>
  168. iustrg(const iustrg<A>& src) : len(0) {
  169. append(src);
  170. }
  171. iustrg(const char* src) : len(0) {
  172. append(src);
  173. }
  174. ~iustrg() {
  175. }
  176. int length() const {
  177. return len;
  178. }
  179. int size() const {
  180. return length();
  181. }
  182. int max_size() const {
  183. return INT_MAX;
  184. }
  185. void clear() {
  186. buf.dealloc();
  187. len = 0;
  188. }
  189. bool empty() const {
  190. return len == 0;
  191. }
  192. const T& operator[](int pos) const {
  193. return at(pos);
  194. }
  195. T& operator[](int pos) {
  196. return at(pos);
  197. }
  198. const T& operator()(int pos) const {
  199. return at(pos);
  200. }
  201. T& operator()(int pos) {
  202. return at(pos);
  203. }
  204. const T& at(int pos) const {
  205. if(pos < 0 || unsigned(pos) >= unsigned(len)) {
  206. throw "out of bounds";
  207. }
  208. return buf(pos);
  209. }
  210. T& at(int pos) {
  211. if(pos < 0 || unsigned(pos) >= unsigned(len)) {
  212. throw "out of bounds";
  213. }
  214. return buf(pos);
  215. }
  216. virtual iustrg<T>& append(const char* s, int pos, int n) {
  217. for(int i=pos; i<pos+n && s[i]!='\0'; i++) {
  218. push_back((s[i]));
  219. }
  220. return *this;
  221. }
  222. iustrg<T>& append(const char* s, int n) {
  223. return append(s, 0, n);
  224. }
  225. iustrg<T>& append(const char* s) {
  226. return append(s, 0, strlen(s));
  227. }
  228. template <class A>
  229. iustrg<T>& append(const iustrg<A>& str, int pos, int n) {
  230. for(int i=pos; i<pos+n && str[i]!='\0'; i++) {
  231. push_back(str[i]);
  232. }
  233. return *this;
  234. }
  235. template <class A>
  236. iustrg<T>& append(const iustrg<A>& str, int n) {
  237. return append(str, 0, n);
  238. }
  239. template <class A>
  240. iustrg<T>& append(const iustrg<A>& str) {
  241. return append(str, 0, str.length());
  242. }
  243. iustrg<T>& append(int n, T c) {
  244. for(int i=0; i<n; i++) {
  245. push_back(c);
  246. }
  247. }
  248. iustrg<T>& append(int x) {
  249. sprintf_append(*this, "%d", x);
  250. return *this;
  251. }
  252. iustrg<T>& append(long x) {
  253. sprintf_append(*this, "%ld", x);
  254. return *this;
  255. }
  256. iustrg<T>& append(double x) {
  257. sprintf_append(*this, "%f", x);
  258. return *this;
  259. }
  260. template <class A>
  261. iustrg<T>& operator+=(const A& s) {
  262. return append(s);
  263. }
  264. template <class A>
  265. void push_back(const A c) {
  266. buf.grow_to(len + 2); // +1 new char, +1 terminating \0
  267. buf.at(len) = T(c);
  268. buf.at(len+1) = T('\0');
  269. len++;
  270. }
  271. void push(const T& c) {
  272. push_back(c);
  273. }
  274. iustrg<T>& assign(const char *s, int pos, int n) {
  275. T *p = buf.data;
  276. buf.data = 0;
  277. clear();
  278. append(s, pos, n);
  279. if (p)
  280. delete[] p;
  281. return *this;
  282. }
  283. iustrg<T>& assign(const char* s, int n) {
  284. return assign(s, 0, n);
  285. }
  286. iustrg<T>& assign(const char* s) {
  287. if(s==0) s = "";
  288. return assign(s, 0, strlen(s));
  289. }
  290. iustrg<T>& assign(const iustrg<T>& str, int pos, int n) {
  291. T *p = buf.data;
  292. buf.data = 0;
  293. clear();
  294. append(str, pos, n);
  295. if (p)
  296. delete[] p;
  297. return *this;
  298. }
  299. iustrg<T>& assign(const iustrg<T>& str, int n) {
  300. return assign(str, 0, n);
  301. }
  302. iustrg<T>& assign(const iustrg<T>& str) {
  303. return assign(str, 0, str.length());
  304. }
  305. iustrg<T>& assign(int n, T c) {
  306. clear();
  307. return append(n, c);
  308. }
  309. template<class A>
  310. iustrg<T>& assign(const A& x) {
  311. clear();
  312. return append(x);
  313. }
  314. template<class A>
  315. iustrg<T>& operator=(const A& x) {
  316. return assign(x);
  317. }
  318. iustrg<T>& replace(int pos, int n1, const char* s, int n2) {
  319. iustrg<T> tmp;
  320. tmp.append(*this, pos);
  321. tmp.append(s, n2);
  322. tmp.append(*this, pos+n1, length() - pos - n1);
  323. return assign(tmp);
  324. }
  325. iustrg<T>& replace(int pos, int n1, const char* s) {
  326. return replace(pos, n1, s, strlen(s));
  327. }
  328. iustrg<T>& replace(int pos, int n1, const iustrg<T>& str, int n2) {
  329. iustrg<T> tmp;
  330. tmp.append(*this, pos);
  331. tmp.append(str, n2);
  332. tmp.append(*this, pos+n1, length() - pos - n1);
  333. return assign(tmp);
  334. }
  335. iustrg<T>& replace(int pos, int n1, const iustrg<T>& str) {
  336. return replace(pos, n1, str.buf, str.length());
  337. }
  338. iustrg<T>& replace(int pos, int n1, int n2, T c) {
  339. iustrg<T> tmp;
  340. tmp.append(*this, pos);
  341. tmp.append(n2, c);
  342. tmp.append(*this, pos+n1, length() - pos - n1);
  343. return assign(tmp);
  344. }
  345. iustrg<T>& insert(int pos, const char *s, int n) {
  346. return replace(pos, 0, s, n);
  347. }
  348. iustrg<T>& insert(int pos, const char *s) {
  349. return insert(pos, s, strlen(s));
  350. }
  351. iustrg<T>& insert(int pos, const iustrg<T>& str, int n) {
  352. return replace(pos, 0, str, n);
  353. }
  354. iustrg<T>& insert(int pos, const iustrg<T>& str) {
  355. return insert(pos, str, str.length());
  356. }
  357. iustrg<T>& insert(int pos, int n, T c) {
  358. return replace(pos, 0, n, c);
  359. }
  360. iustrg<T>& erase(int pos, int n) {
  361. return replace(pos, n, (const char*)0, 0);
  362. }
  363. iustrg<T>& erase(int pos) {
  364. return erase(pos, length()-pos);
  365. }
  366. int copy(char* dst, int n, int pos=0) const {
  367. int nCpy = 0;
  368. for(int i=pos; i<pos+n && i<len; i++) {
  369. dst[i] = (char)at(i);
  370. nCpy++;
  371. }
  372. return nCpy;
  373. }
  374. void copy_string(char* dst, int pos=0) const {
  375. for(int i=pos; i<len; i++) {
  376. dst[i] = (char)at(i);
  377. }
  378. dst[len] = '\0';
  379. }
  380. void swap(iustrg<T>& str) {
  381. narray<T> tmp;
  382. tmp.move(buf);
  383. buf.move(str.buf);
  384. str.buf.move(tmp);
  385. }
  386. int compare(int pos1, int n1, const iustrg<T>& str, int pos2, int n2) const {
  387. n1 = min(len-pos1, n1);
  388. n2 = min(str.length()-pos2, n2);
  389. for(int i=0; i<n1 && i<n2; i++) {
  390. if(at(pos1+i) > str[pos2+i]) {
  391. return 1;
  392. } else if(at(pos1+i) < str[pos2+i]) {
  393. return -1;
  394. }
  395. }
  396. if(n1 > n2) {
  397. return 1;
  398. } else if(n1 < n2) {
  399. return -1;
  400. } else {
  401. return 0;
  402. }
  403. }
  404. int compare(int pos1, int n1, const iustrg<T>& str) const {
  405. return compare(pos1, n1, str, 0, str.length());
  406. }
  407. int compare(const iustrg<T>& str) const {
  408. return compare(0, len, str, 0, str.length());
  409. }
  410. int compare(int pos1, int n1, const char* s, int pos2, int n2) const {
  411. n1 = min(len-pos1, n1);
  412. for(int i=0; i<n1 && i<n2; i++) {
  413. if(at(pos1+i) > s[pos2+i]) {
  414. return 1;
  415. } else if(at(pos1+i) < s[pos2+i]) {
  416. return -1;
  417. }
  418. }
  419. if(n1 > n2) {
  420. return 1;
  421. } else if(n1 < n2) {
  422. return -1;
  423. } else {
  424. return 0;
  425. }
  426. }
  427. int compare(int pos1, int n1, const char* s) const {
  428. return compare(pos1, n1, s, 0, strlen(s));
  429. }
  430. int compare(const char* s) const {
  431. return compare(0, len, s, 0, strlen(s));
  432. }
  433. bool operator==(const iustrg<T>& s) {
  434. return compare(s) == 0;
  435. }
  436. bool operator==(const char* s) {
  437. return compare(s) == 0;
  438. }
  439. bool operator!=(const iustrg<T>& s) {
  440. return compare(s) != 0;
  441. }
  442. bool operator!=(const char* s) {
  443. return compare(s) != 0;
  444. }
  445. iustrg<T> substr(int pos, int n) const {
  446. return iustrg<T>().append(*this, pos, n);
  447. }
  448. iustrg<T> substr(int pos) const {
  449. return substr(pos, len-pos);
  450. }
  451. const T* c_str() {
  452. return buf.data;
  453. }
  454. operator const T*() {
  455. return c_str();
  456. }
  457. operator bool() {
  458. return !empty();
  459. }
  460. narray<T>& data() const {
  461. return buf;
  462. }
  463. int find(const iustrg<T>& str, int pos=0) const {
  464. pos = limit(0, len-1, pos);
  465. for(int i=pos; i<=len-str.length(); i++) {
  466. if(compare(i, str.length(), str) == 0) {
  467. return i;
  468. }
  469. }
  470. return npos;
  471. }
  472. int find(const char* s, int pos, int n) const {
  473. pos = limit(0, len-1, pos);
  474. for(int i=pos; i<=len-n; i++) {
  475. if(compare(i, n, s, 0, n) == 0) {
  476. return i;
  477. }
  478. }
  479. return npos;
  480. }
  481. int find(const char* s, int pos=0) const {
  482. return find(s, pos, strlen(s));
  483. }
  484. int find(T c, int pos=0) const {
  485. pos = limit(0, len-1, pos);
  486. for(int i=pos; i<len; i++) {
  487. if(at(i) == c) {
  488. return i;
  489. }
  490. }
  491. return npos;
  492. }
  493. int rfind(const iustrg<T>& str, int pos=npos) const {
  494. if(pos < 0) {
  495. pos = len-1;
  496. } else {
  497. pos = limit(0, len-1, pos);
  498. }
  499. for(int i=pos; i>=0; i--) {
  500. if(compare(i, str.length(), str) == 0) {
  501. return i;
  502. }
  503. }
  504. return npos;
  505. }
  506. int rfind(const char* s, int pos, int n) const {
  507. if(pos < 0) {
  508. pos = len-1;
  509. } else {
  510. pos = limit(0, len-1, pos);
  511. }
  512. for(int i=pos; i>=0; i--) {
  513. if(compare(i, n, s, 0, n) == 0) {
  514. return i;
  515. }
  516. }
  517. return npos;
  518. }
  519. int rfind(const char* s, int pos=npos) const {
  520. return rfind(s, pos, strlen(s));
  521. }
  522. int rfind(T c, int pos=npos) const {
  523. if(pos < 0) {
  524. pos = len-1;
  525. } else {
  526. pos = limit(0, len-1, pos);
  527. }
  528. for(int i=pos; i>=0; i--) {
  529. if(at(i) == c) {
  530. return i;
  531. }
  532. }
  533. return npos;
  534. }
  535. static const int npos = -1;
  536. inline static int limit(int minV, int maxV, int value) {
  537. return max(minV, min(maxV, value));
  538. }
  539. narray<T>& getBuf() {
  540. return buf;
  541. }
  542. void utf8Encode(utf8strg& utf8) {
  543. utf8.clear();
  544. for(int i=0; i<length(); i++) {
  545. utf8.pushUnicode((*this)[i]);
  546. }
  547. }
  548. /// encodes the string with utf8 and adds a terminating '\0'
  549. void utf8EncodeTerm(utf8strg& utf8) {
  550. utf8Encode(utf8);
  551. utf8.pushUnicode(nuchar('\0'));
  552. }
  553. void utf8Decode(utf8strg& utf8) {
  554. clear();
  555. int i = 0;
  556. while(i<utf8.length()) {
  557. push_back(utf8.getUnicode(i));
  558. }
  559. }
  560. void utf8Encode(char *result, int size) {
  561. utf8strg utf8;
  562. utf8Encode(utf8);
  563. utf8.copyTo(result, size);
  564. }
  565. void utf8Decode(const char *s,int n) {
  566. utf8strg utf8;
  567. int i = 0;
  568. while(length() < n) {
  569. push(nuchar(utf8strg::decode(s, i)));
  570. }
  571. }
  572. int utf8Length() {
  573. utf8strg utf8;
  574. utf8Encode(utf8);
  575. return utf8.length();
  576. }
  577. protected:
  578. narray<T> buf; /// the actual characters
  579. int len; /// length of the string
  580. };
  // Convenience names for the two instantiations used with this header:
  // strg holds char elements, ustrg holds nuchar elements; nustring and
  // iucstring are alternative names for ustrg and strg respectively.
  581. typedef iustrg<char> strg;
  582. typedef iustrg<nuchar> ustrg;
  583. typedef ustrg nustring;
  584. typedef strg iucstring;
  585. template<class T>
  586. inline static iustrg<T> operator+(const iustrg<T>& s1, const iustrg<T>& s2) {
  587. iustrg<T> s;
  588. s.append(s1);
  589. s.append(s2);
  590. return s;
  591. }
  592. template<class T, class A>
  593. inline static iustrg<T> operator+(const iustrg<T>& s1, const A& s2) {
  594. iustrg<T> s;
  595. s.append(s1);
  596. s.append(s2);
  597. return s;
  598. }
  599. template<class T, class A>
  600. inline static iustrg<T> operator+(const A& s1, const iustrg<T>& s2) {
  601. iustrg<T> s;
  602. s.append(s1);
  603. s.append(s2);
  604. return s;
  605. }
  606. template<class T>
  607. inline static bool operator==(const iustrg<T>& s1, const iustrg<T>& s2) {
  608. return s1.compare(s2) == 0;
  609. }
  610. template<class T>
  611. inline static bool operator==(const iustrg<T>& s1, const char* s2) {
  612. return s1.compare(s2) == 0;
  613. }
  614. template<class T>
  615. inline static bool operator==(const char* s1, const iustrg<T>& s2) {
  616. return s2.compare(s1) == 0;
  617. }
  618. template<class T>
  619. inline static bool operator!=(const iustrg<T>& s1, const iustrg<T>& s2) {
  620. return !operator==(s1, s2);
  621. }
  622. template<class T>
  623. inline static bool operator!=(const iustrg<T>& s1, const char* s2) {
  624. return !operator==(s1, s2);
  625. }
  626. template<class T>
  627. inline static bool operator!=(const char* s1, const iustrg<T>& s2) {
  628. return !operator==(s1, s2);
  629. }
  630. template<class T>
  631. inline int sprintf(iustrg<T>& str, const char *format, ...) {
  632. int maxLen = 64;
  633. char* tmp = NULL;
  634. int result = 0;
  635. va_list va;
  636. do {
  637. maxLen *= 2;
  638. tmp = (char*)realloc(tmp, maxLen+1);
  639. va_start(va, format);
  640. result = vsnprintf(tmp, maxLen, format, va);
  641. va_end(va);
  642. } while(result >= maxLen);
  643. str.assign(tmp);
  644. free(tmp);
  645. return result;
  646. }
  647. template<class T>
  648. inline int sprintf_append(iustrg<T>& str, const char *format, ...) {
  649. int maxLen = 64;
  650. char* tmp = NULL;
  651. int result = 0;
  652. va_list va;
  653. do {
  654. maxLen *= 2;
  655. tmp = (char*)realloc(tmp, maxLen+1);
  656. va_start(va, format);
  657. result = vsnprintf(tmp, maxLen, format, va);
  658. va_end(va);
  659. } while(result >= maxLen);
  660. str.append(tmp);
  661. free(tmp);
  662. return result;
  663. }
  664. template<class T>
  665. inline int scanf(iustrg<T>& str, const char *format, ...) {
  666. const char* buf = str.c_str();
  667. va_list va;
  668. va_start(va, format);
  669. int result = vsscanf(buf, format, va);
  670. va_end(va);
  671. return result;
  672. }
  673. template<class T>
  674. inline iustrg<T>& fgets(iustrg<T>& str, FILE* stream = stdin) {
  675. int c;
  676. while(((c = fgetc(stream)) != EOF) && (c != '\n')) {
  677. str.push_back(c);
  678. }
  679. return str;
  680. }
  681. template<class T>
  682. inline int fputs(const iustrg<T>& str, FILE* stream = stdout) {
  683. for(int i=0; i<str.length(); i++) {
  684. if(fputc(str[i], stream) == EOF) {
  685. return EOF;
  686. }
  687. }
  688. if(fputc('\n', stream) == EOF) {
  689. return EOF;
  690. }
  691. return str.length() + 1;
  692. }
  693. template<class T>
  694. inline int read(iustrg<T>& str, int n, FILE* stream) {
  695. T c;
  696. int i = 0;
  697. while((i < n) && (fread(&c, sizeof(T), 1, stream) == 1)) {
  698. str.push_back(c);
  699. i++;
  700. }
  701. if(ferror(stream)) {
  702. i *= -1;
  703. }
  704. return i;
  705. }
  706. template<class T>
  707. inline int fread(iustrg<T>& str, FILE* stream) {
  708. return read(str, INT_MAX, stream);
  709. }
  710. template<class T>
  711. inline int write(iustrg<T>& str, int n, FILE* stream) {
  712. n = iustrg<T>::limit(0, str.length(), n);
  713. int i = 0;
  714. while((i < n) && (fwrite(&str[i], sizeof(T), 1, stream) == 1)) {
  715. i++;
  716. }
  717. return i;
  718. }
  719. template<class T>
  720. inline int fwrite(iustrg<T>& str, FILE* stream) {
  721. return write(str, str.length(), stream);
  722. }
  /// Compiles a POSIX regular expression.
  ///
  /// @param regex   receives the compiled expression; release with regfree()
  /// @param pattern pattern text
  /// @param cflags  flags passed to regcomp()
  /// @param eflags  accepted for symmetry with the other re_* functions; unused
  /// @throws char*  the regerror() message when the pattern does not compile.
  ///         The message lives in a static buffer, so it stays readable in the
  ///         handler but is overwritten by the next failure (not thread-safe).
  inline void re_compile(regex_t* regex, const char* pattern, int cflags=0, int eflags=0) {
      (void)eflags;
      int error = regcomp(regex, pattern, cflags);
      if(error) {
          // Static storage: the original threw a pointer to a local array,
          // which no longer exists when a handler reads it.
          static char errMsg[256];
          // Fetch the message before releasing the regex object.
          regerror(error, regex, errMsg, sizeof(errMsg));
          regfree(regex);
          throw errMsg;
      }
  }
  732. inline int re_search(const strg& str, const char* pattern, int cflags=0, int eflags=0) {
  733. regex_t regex;
  734. re_compile(&regex, pattern, cflags, eflags);
  735. char* buf = (char*)malloc(str.length()+1);
  736. str.copy_string(buf);
  737. regmatch_t regmatch;
  738. int index = -1;
  739. int error;
  740. if((error = regexec(&regex, buf, 1, &regmatch, eflags)) == 0) {
  741. index = regmatch.rm_so;
  742. }
  743. regfree(&regex);
  744. free(buf);
  745. if(error == REG_ESPACE) {
  746. throw "out of memory";
  747. }
  748. return index;
  749. }
  750. inline int re_gsub(strg& str, const char* pattern, const char* sub, int n = -1, int cflags=0, int eflags=0) {
  751. regex_t regex;
  752. re_compile(&regex, pattern, cflags, eflags);
  753. const char* buf = str.c_str();
  754. strg result;
  755. regmatch_t regmatch;
  756. int s = 0;
  757. int nMatches = 0;
  758. int error = 0;
  759. while((n<0 || nMatches<n) && ((error = regexec(&regex, buf + s, 1, &regmatch, eflags)) == 0)) {
  760. result.append(str.substr(s, regmatch.rm_so));
  761. result.append(sub);
  762. eflags = REG_NOTBOL;
  763. s += regmatch.rm_eo;
  764. nMatches++;
  765. }
  766. result.append(str.substr(s));
  767. regfree(&regex);
  768. if(error == REG_ESPACE) {
  769. throw "out of memory";
  770. }
  771. str.assign(result);
  772. return nMatches;
  773. }
  774. inline int re_sub(strg& str, const char* pattern, const char* sub, int cflags=0, int eflags=0) {
  775. return re_gsub(str, pattern, sub, 1, cflags, eflags);
  776. }
  777. inline int fwrite(EncodedString& s, FILE *file) {
  778. return s.fwrite(file);
  779. }
  780. inline int fread(EncodedString& s, FILE *file) {
  781. return s.fread(file);
  782. }
  783. inline EncodedString& fgets(EncodedString& s, FILE *file) {
  784. return s.fgets(file);
  785. }
  786. inline int fputs(EncodedString& s, FILE *file) {
  787. return s.fputs(file);
  788. }
  789. inline int freadUTF8(iustrg<nuchar>& s, FILE* file) {
  790. utf8strg utf8;
  791. int r = fread(utf8, file);
  792. s.utf8Decode(utf8);
  793. return r;
  794. }
  795. inline iustrg<nuchar>& fgetsUTF8(iustrg<nuchar>& s, FILE *file) {
  796. utf8strg utf8;
  797. fgets(utf8, file);
  798. s.utf8Decode(utf8);
  799. return s;
  800. }
  801. inline int fwriteUTF8(iustrg<nuchar>& s, FILE* file) {
  802. utf8strg utf8;
  803. s.utf8Encode(utf8);
  804. return fwrite(utf8, file);
  805. }
  806. inline int fputsUTF8(iustrg<nuchar>& s, FILE* file) {
  807. utf8strg utf8;
  808. s.utf8Encode(utf8);
  809. return fputs(utf8, file);
  810. }
  // Disabled draft (compiled out by the `#if 0` below) of a UTF-16 counterpart
  // to utf8strg.
  // NOTE(review): before enabling, reconcile it with EncodedString. The pure
  // virtuals there are pushUnicode(const nuchar&) and nuchar getUnicode(int&),
  // while this draft uses unsigned int and a const getUnicode, so it would
  // not override them.
  // NOTE(review): in the 4-byte branch pushUnicode() pushes the 0xDC00-based
  // unit before the 0xD800-based one and hands whole 16-bit values to push(),
  // whereas decode() reads two bytes per unit (low byte first) and expects
  // the 0xD800..0xDBFF unit first -- the two halves do not round-trip as
  // written.
  811. #if 0
  812. class utf16strg : public EncodedString {
  813. public:
  814. void pushUnicode(unsigned int c) {
  815. // -- 2 bytes --
  816. if(c <= 0xFFFF) {
  817. push(c & 0xFF);
  818. push(c >> 8);
  819. // -- 4 bytes --
  820. } else {
  821. c -= 0x10000;
  822. push(0xDC00 | (c & 0x3FF));
  823. push(0xD800 | ((c >> 10) & 0x3FF));
  824. }
  825. }
  826. unsigned int getUnicode(int& i) const {
  827. return decode(*this, i);
  828. }
  829. template<class T>
  830. static unsigned int decode(T str, int& i) {
  831. unsigned int x = 0;
  832. unsigned int y = 0;
  833. int b = 0;
  834. do {
  835. unsigned char c = str[i++];
  836. if(b == 0) {
  837. x = c;
  838. } else if(b == 1) {
  839. x |= ((unsigned int)c) << 8;
  840. if(x < 0xD800 || x > 0xDFFF) {
  841. b = -1;
  842. } else if(x < 0xD800 || x > 0xDBFF) {
  843. throw "UTF-16 decoding error";
  844. }
  845. } else if(b == 2) {
  846. y = c;
  847. } else if(b == 3) {
  848. y |= ((unsigned int)c) << 8;
  849. if(y < 0xDC00 || y > 0xDFFF) {
  850. throw "UTF-16 decoding error";
  851. }
  852. x = (((x & 0x3FF) << 10) | (y & 0x3FF)) + 0x10000;
  853. b = -1;
  854. }
  855. b++;
  856. } while(b != 0);
  857. return x;
  858. }
  859. };
  860. #endif
  861. }
  862. #endif /* iustring_h__ */