PageRenderTime 57ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 1ms

/sstring.h

https://github.com/msGenDev/bowtie2
C Header | 3435 lines | 2138 code | 313 blank | 984 comment | 417 complexity | 37c2a47faa0c3ff5df42ad761e2f7c0a MD5 | raw file
Possible License(s): GPL-3.0

Large files are truncated, but you can click here to view the full file

  1. /*
  2. * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
  3. *
  4. * This file is part of Bowtie 2.
  5. *
  6. * Bowtie 2 is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * Bowtie 2 is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #ifndef SSTRING_H_
  20. #define SSTRING_H_
  21. #include <string.h>
  22. #include <iostream>
  23. #include "assert_helpers.h"
  24. #include "alphabet.h"
  25. #include "random_source.h"
  26. /**
  27. * Four kinds of strings defined here:
  28. *
  29. * SString:
  30. * A fixed-length string using heap memory with size set at construction time
  31. * or when install() member is called.
  32. *
  33. * S2bDnaString:
  34. * Like SString, but stores a list of uint32_t words where each word is divided
  35. * into 16 2-bit slots interpreted as holding one A/C/G/T nucleotide each.
  36. *
  37. * TODO: S3bDnaString allowing N. S4bDnaString allowing nucleotide masks.
  38. *
  39. * SStringExpandable:
  40. * A string using heap memory where the size of the backing store is
  41. * automatically resized as needed. Supports operations like append, insert,
  42. * erase, etc.
  43. *
  44. * SStringFixed:
  45. * A fixed-length string using stack memory where size is set at compile
  46. * time.
  47. *
  48. * All string classes have some extra facilities that make it easy to print the
  49. * string, including when the string uses an encoded alphabet. See toZBuf()
  50. * and toZBufXForm().
  51. *
  52. * Global lt, eq, and gt template functions are supplied. They are capable of
  53. * doing lexicographical comparisons between any of the three categories of
  54. * strings defined here.
  55. */
  56. template<typename T>
  57. class Class_sstr_len {
  58. public:
  59. static inline size_t sstr_len(const T& s) {
  60. return s.length();
  61. }
  62. };
  63. template<unsigned N>
  64. class Class_sstr_len<const char[N]> {
  65. public:
  66. static inline size_t sstr_len(const char s[N]) {
  67. return strlen(s);
  68. }
  69. };
  70. template<>
  71. class Class_sstr_len<const char *> {
  72. public:
  73. static inline size_t sstr_len(const char *s) {
  74. return strlen(s);
  75. }
  76. };
  77. template<>
  78. class Class_sstr_len<const unsigned char *> {
  79. public:
  80. static inline size_t sstr_len(const unsigned char *s) {
  81. return strlen((const char *)s);
  82. }
  83. };
  84. template<typename T1, typename T2>
  85. static inline bool sstr_eq(const T1& s1, const T2& s2) {
  86. size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
  87. size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
  88. if(len1 != len2) return false;
  89. for(size_t i = 0; i < len1; i++) {
  90. if(s1[i] != s2[i]) return false;
  91. }
  92. return true;
  93. }
  94. template<typename T1, typename T2>
  95. static inline bool sstr_neq(const T1& s1, const T2& s2) {
  96. return !sstr_eq(s1, s2);
  97. }
  98. /**
  99. * Return true iff the given suffix of s1 is equal to the given suffix of s2 up
  100. * to upto characters.
  101. */
  102. template<typename T1, typename T2>
  103. static inline bool sstr_suf_upto_eq(
  104. const T1& s1, size_t suf1,
  105. const T2& s2, size_t suf2,
  106. size_t upto,
  107. bool endlt = true)
  108. {
  109. assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
  110. assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
  111. size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
  112. size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
  113. if(len1 > upto) len1 = upto;
  114. if(len2 > upto) len2 = upto;
  115. if(len1 != len2) return false;
  116. for(size_t i = 0; i < len1; i++) {
  117. if(s1[suf1+i] != s2[suf2+i]) {
  118. return false;
  119. }
  120. }
  121. return true;
  122. }
  123. /**
  124. * Return true iff the given suffix of s1 is equal to the given suffix of s2 up
  125. * to upto characters.
  126. */
  127. template<typename T1, typename T2>
  128. static inline bool sstr_suf_upto_neq(
  129. const T1& s1, size_t suf1,
  130. const T2& s2, size_t suf2,
  131. size_t upto,
  132. bool endlt = true)
  133. {
  134. return !sstr_suf_upto_eq(s1, suf1, s2, suf2, upto, endlt);
  135. }
  136. /**
  137. * Return true iff s1 is less than s2.
  138. */
  139. template<typename T1, typename T2>
  140. static inline bool sstr_lt(const T1& s1, const T2& s2, bool endlt = true) {
  141. size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
  142. size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
  143. size_t minlen = (len1 < len2 ? len1 : len2);
  144. for(size_t i = 0; i < minlen; i++) {
  145. if(s1[i] < s2[i]) {
  146. return true;
  147. } else if(s1[i] > s2[i]) {
  148. return false;
  149. }
  150. }
  151. if(len1 == len2) return false;
  152. return (len1 < len2) == endlt;
  153. }
  154. /**
  155. * Return true iff the given suffix of s1 is less than the given suffix of s2.
  156. */
  157. template<typename T1, typename T2>
  158. static inline bool sstr_suf_lt(
  159. const T1& s1, size_t suf1,
  160. const T2& s2, size_t suf2,
  161. bool endlt = true)
  162. {
  163. assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
  164. assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
  165. size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
  166. size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
  167. size_t minlen = (len1 < len2 ? len1 : len2);
  168. for(size_t i = 0; i < minlen; i++) {
  169. if(s1[suf1+i] < s2[suf2+i]) {
  170. return true;
  171. } else if(s1[suf1+i] > s2[suf2+i]) {
  172. return false;
  173. }
  174. }
  175. if(len1 == len2) return false;
  176. return (len1 < len2) == endlt;
  177. }
  178. /**
  179. * Return true iff the given suffix of s1 is less than the given suffix of s2.
  180. * Treat s1 and s2 as though they have lengths len1/len2.
  181. */
  182. template<typename T1, typename T2>
  183. static inline bool sstr_suf_lt(
  184. const T1& s1, size_t suf1, size_t len1,
  185. const T2& s2, size_t suf2, size_t len2,
  186. bool endlt = true)
  187. {
  188. assert_leq(suf1, len1);
  189. assert_leq(suf2, len2);
  190. size_t left1 = len1 - suf1;
  191. size_t left2 = len2 - suf2;
  192. size_t minleft = (left1 < left2 ? left1 : left2);
  193. for(size_t i = 0; i < minleft; i++) {
  194. if(s1[suf1+i] < s2[suf2+i]) {
  195. return true;
  196. } else if(s1[suf1+i] > s2[suf2+i]) {
  197. return false;
  198. }
  199. }
  200. if(left1 == left2) return false;
  201. return (left1 < left2) == endlt;
  202. }
  203. /**
  204. * Return true iff the given suffix of s1 is less than the given suffix of s2
  205. * up to upto characters.
  206. */
  207. template<typename T1, typename T2>
  208. static inline bool sstr_suf_upto_lt(
  209. const T1& s1, size_t suf1,
  210. const T2& s2, size_t suf2,
  211. size_t upto,
  212. bool endlt = true)
  213. {
  214. assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
  215. assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
  216. size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
  217. size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
  218. if(len1 > upto) len1 = upto;
  219. if(len2 > upto) len2 = upto;
  220. size_t minlen = (len1 < len2 ? len1 : len2);
  221. for(size_t i = 0; i < minlen; i++) {
  222. if(s1[suf1+i] < s2[suf2+i]) {
  223. return true;
  224. } else if(s1[suf1+i] > s2[suf2+i]) {
  225. return false;
  226. }
  227. }
  228. if(len1 == len2) return false;
  229. return (len1 < len2) == endlt;
  230. }
  231. /**
  232. * Return true iff the given prefix of s1 is less than the given prefix of s2.
  233. */
  234. template<typename T1, typename T2>
  235. static inline bool sstr_pre_lt(
  236. const T1& s1, size_t pre1,
  237. const T2& s2, size_t pre2,
  238. bool endlt = true)
  239. {
  240. assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
  241. assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
  242. size_t len1 = pre1;
  243. size_t len2 = pre2;
  244. size_t minlen = (len1 < len2 ? len1 : len2);
  245. for(size_t i = 0; i < minlen; i++) {
  246. if(s1[i] < s2[i]) {
  247. return true;
  248. } else if(s1[i] > s2[i]) {
  249. return false;
  250. }
  251. }
  252. if(len1 == len2) return false;
  253. return (len1 < len2) == endlt;
  254. }
  255. /**
  256. * Return true iff s1 is less than or equal to s2.
  257. */
  258. template<typename T1, typename T2>
  259. static inline bool sstr_leq(const T1& s1, const T2& s2, bool endlt = true) {
  260. size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
  261. size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
  262. size_t minlen = (len1 < len2 ? len1 : len2);
  263. for(size_t i = 0; i < minlen; i++) {
  264. if(s1[i] < s2[i]) {
  265. return true;
  266. } else if(s1[i] > s2[i]) {
  267. return false;
  268. }
  269. }
  270. if(len1 == len2) return true;
  271. return (len1 < len2) == endlt;
  272. }
  273. /**
  274. * Return true iff the given suffix of s1 is less than or equal to the given
  275. * suffix of s2.
  276. */
  277. template<typename T1, typename T2>
  278. static inline bool sstr_suf_leq(
  279. const T1& s1, size_t suf1,
  280. const T2& s2, size_t suf2,
  281. bool endlt = true)
  282. {
  283. assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
  284. assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
  285. size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
  286. size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
  287. size_t minlen = (len1 < len2 ? len1 : len2);
  288. for(size_t i = 0; i < minlen; i++) {
  289. if(s1[suf1+i] < s2[suf2+i]) {
  290. return true;
  291. } else if(s1[suf1+i] > s2[suf2+i]) {
  292. return false;
  293. }
  294. }
  295. if(len1 == len2) return true;
  296. return (len1 < len2) == endlt;
  297. }
  298. /**
  299. * Return true iff the given prefix of s1 is less than or equal to the given
  300. * prefix of s2.
  301. */
  302. template<typename T1, typename T2>
  303. static inline bool sstr_pre_leq(
  304. const T1& s1, size_t pre1,
  305. const T2& s2, size_t pre2,
  306. bool endlt = true)
  307. {
  308. assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
  309. assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
  310. size_t len1 = pre1;
  311. size_t len2 = pre2;
  312. size_t minlen = (len1 < len2 ? len1 : len2);
  313. for(size_t i = 0; i < minlen; i++) {
  314. if(s1[i] < s2[i]) {
  315. return true;
  316. } else if(s1[i] > s2[i]) {
  317. return false;
  318. }
  319. }
  320. if(len1 == len2) return true;
  321. return (len1 < len2) == endlt;
  322. }
  323. /**
  324. * Return true iff s1 is greater than s2.
  325. */
  326. template<typename T1, typename T2>
  327. static inline bool sstr_gt(const T1& s1, const T2& s2, bool endlt = true) {
  328. size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
  329. size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
  330. size_t minlen = (len1 < len2 ? len1 : len2);
  331. for(size_t i = 0; i < minlen; i++) {
  332. if(s1[i] > s2[i]) {
  333. return true;
  334. } else if(s1[i] < s2[i]) {
  335. return false;
  336. }
  337. }
  338. if(len1 == len2) return false;
  339. return (len1 > len2) == endlt;
  340. }
  341. /**
  342. * Return true iff the given suffix of s1 is greater than the given suffix of
  343. * s2.
  344. */
  345. template<typename T1, typename T2>
  346. static inline bool sstr_suf_gt(
  347. const T1& s1, size_t suf1,
  348. const T2& s2, size_t suf2,
  349. bool endlt = true)
  350. {
  351. assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
  352. assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
  353. size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
  354. size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
  355. size_t minlen = (len1 < len2 ? len1 : len2);
  356. for(size_t i = 0; i < minlen; i++) {
  357. if(s1[suf1+i] > s2[suf2+i]) {
  358. return true;
  359. } else if(s1[suf1+i] < s2[suf2+i]) {
  360. return false;
  361. }
  362. }
  363. if(len1 == len2) return false;
  364. return (len1 > len2) == endlt;
  365. }
  366. /**
  367. * Return true iff the given prefix of s1 is greater than the given prefix of
  368. * s2.
  369. */
  370. template<typename T1, typename T2>
  371. static inline bool sstr_pre_gt(
  372. const T1& s1, size_t pre1,
  373. const T2& s2, size_t pre2,
  374. bool endlt = true)
  375. {
  376. assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
  377. assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
  378. size_t len1 = pre1;
  379. size_t len2 = pre2;
  380. size_t minlen = (len1 < len2 ? len1 : len2);
  381. for(size_t i = 0; i < minlen; i++) {
  382. if(s1[i] > s2[i]) {
  383. return true;
  384. } else if(s1[i] < s2[i]) {
  385. return false;
  386. }
  387. }
  388. if(len1 == len2) return false;
  389. return (len1 > len2) == endlt;
  390. }
  391. /**
  392. * Return true iff s1 is greater than or equal to s2.
  393. */
  394. template<typename T1, typename T2>
  395. static inline bool sstr_geq(const T1& s1, const T2& s2, bool endlt = true) {
  396. size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
  397. size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
  398. size_t minlen = (len1 < len2 ? len1 : len2);
  399. for(size_t i = 0; i < minlen; i++) {
  400. if(s1[i] > s2[i]) {
  401. return true;
  402. } else if(s1[i] < s2[i]) {
  403. return false;
  404. }
  405. }
  406. if(len1 == len2) return true;
  407. return (len1 > len2) == endlt;
  408. }
  409. /**
  410. * Return true iff the given suffix of s1 is greater than or equal to the given
  411. * suffix of s2.
  412. */
  413. template<typename T1, typename T2>
  414. static inline bool sstr_suf_geq(
  415. const T1& s1, size_t suf1,
  416. const T2& s2, size_t suf2,
  417. bool endlt = true)
  418. {
  419. assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
  420. assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
  421. size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
  422. size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
  423. size_t minlen = (len1 < len2 ? len1 : len2);
  424. for(size_t i = 0; i < minlen; i++) {
  425. if(s1[suf1+i] > s2[suf2+i]) {
  426. return true;
  427. } else if(s1[suf1+i] < s2[suf2+i]) {
  428. return false;
  429. }
  430. }
  431. if(len1 == len2) return true;
  432. return (len1 > len2) == endlt;
  433. }
  434. /**
  435. * Return true iff the given prefix of s1 is greater than or equal to the given
  436. * prefix of s2.
  437. */
  438. template<typename T1, typename T2>
  439. static inline bool sstr_pre_geq(
  440. const T1& s1, size_t pre1,
  441. const T2& s2, size_t pre2,
  442. bool endlt = true)
  443. {
  444. assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
  445. assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
  446. size_t len1 = pre1;
  447. size_t len2 = pre2;
  448. size_t minlen = (len1 < len2 ? len1 : len2);
  449. for(size_t i = 0; i < minlen; i++) {
  450. if(s1[i] > s2[i]) {
  451. return true;
  452. } else if(s1[i] < s2[i]) {
  453. return false;
  454. }
  455. }
  456. if(len1 == len2) return true;
  457. return (len1 > len2) == endlt;
  458. }
  459. template<typename T>
  460. static inline const char * sstr_to_cstr(const T& s) {
  461. return s.toZBuf();
  462. }
  463. template<>
  464. inline const char * sstr_to_cstr<std::basic_string<char> >(
  465. const std::basic_string<char>& s)
  466. {
  467. return s.c_str();
  468. }
  469. /**
  470. * Simple string class with backing memory whose size is managed by the user
  471. * using the constructor and install() member function. No behind-the-scenes
  472. * reallocation or copying takes place.
  473. */
  474. template<typename T>
  475. class SString {
  476. public:
  477. explicit SString() :
  478. cs_(NULL),
  479. printcs_(NULL),
  480. len_(0)
  481. { }
  482. explicit SString(size_t sz) :
  483. cs_(NULL),
  484. printcs_(NULL),
  485. len_(0)
  486. {
  487. resize(sz);
  488. }
  489. /**
  490. * Create an SStringExpandable from another SStringExpandable.
  491. */
  492. SString(const SString<T>& o) :
  493. cs_(NULL),
  494. printcs_(NULL),
  495. len_(0)
  496. {
  497. *this = o;
  498. }
  499. /**
  500. * Create an SStringExpandable from a std::basic_string of the
  501. * appropriate type.
  502. */
  503. explicit SString(const std::basic_string<T>& str) :
  504. cs_(NULL),
  505. printcs_(NULL),
  506. len_(0)
  507. {
  508. install(str.c_str(), str.length());
  509. }
  510. /**
  511. * Create an SStringExpandable from an array and size.
  512. */
  513. explicit SString(const T* b, size_t sz) :
  514. cs_(NULL),
  515. printcs_(NULL),
  516. len_(0)
  517. {
  518. install(b, sz);
  519. }
  520. /**
  521. * Create an SStringExpandable from a zero-terminated array.
  522. */
  523. explicit SString(const T* b) :
  524. cs_(NULL),
  525. printcs_(NULL),
  526. len_(0)
  527. {
  528. install(b, strlen(b));
  529. }
  530. /**
  531. * Destroy the expandable string object.
  532. */
  533. virtual ~SString() {
  534. if(cs_ != NULL) {
  535. delete[] cs_;
  536. cs_ = NULL;
  537. }
  538. if(printcs_ != NULL) {
  539. delete[] printcs_;
  540. printcs_ = NULL;
  541. }
  542. len_ = 0;
  543. }
  544. /**
  545. * Assignment to other SString.
  546. */
  547. SString<T>& operator=(const SString<T>& o) {
  548. install(o.cs_, o.len_);
  549. return *this;
  550. }
  551. /**
  552. * Assignment to other SString.
  553. */
  554. SString<T>& operator=(const std::basic_string<T>& o) {
  555. install(o);
  556. return *this;
  557. }
  558. /**
  559. * Resizes the string without preserving its contents.
  560. */
  561. void resize(size_t sz) {
  562. if(cs_ != NULL) {
  563. delete cs_;
  564. cs_ = NULL;
  565. }
  566. if(printcs_ != NULL) {
  567. delete printcs_;
  568. printcs_ = NULL;
  569. }
  570. if(sz != 0) {
  571. cs_ = new T[sz+1];
  572. }
  573. len_ = sz;
  574. }
  575. /**
  576. * Return ith character from the left of either the forward or the
  577. * reverse version of the read.
  578. */
  579. T windowGet(
  580. size_t i,
  581. bool fw,
  582. size_t depth = 0,
  583. size_t len = 0) const
  584. {
  585. if(len == 0) len = len_;
  586. assert_lt(i, len);
  587. assert_leq(len, len_ - depth);
  588. return fw ? cs_[depth+i] : cs_[depth+len-i-1];
  589. }
  590. /**
  591. * Return ith character from the left of either the forward or the
  592. * reverse-complement version of the read.
  593. */
  594. void windowGet(
  595. T& ret,
  596. bool fw,
  597. size_t depth = 0,
  598. size_t len = 0) const
  599. {
  600. if(len == 0) len = len_;
  601. assert_leq(len, len_ - depth);
  602. ret.resize(len);
  603. for(size_t i = 0; i < len; i++) {
  604. ret.set(fw ? cs_[depth+i] : cs_[depth+len-i-1], i);
  605. }
  606. }
  607. /**
  608. * Set character at index 'idx' to 'c'.
  609. */
  610. inline void set(int c, size_t idx) {
  611. assert_lt(idx, len_);
  612. cs_[idx] = c;
  613. }
  614. /**
  615. * Retrieve constant version of element i.
  616. */
  617. inline const T& operator[](size_t i) const {
  618. assert_lt(i, len_);
  619. return cs_[i];
  620. }
  621. /**
  622. * Retrieve mutable version of element i.
  623. */
  624. inline T& operator[](size_t i) {
  625. assert_lt(i, len_);
  626. return cs_[i];
  627. }
  628. /**
  629. * Retrieve constant version of element i.
  630. */
  631. inline const T& get(size_t i) const {
  632. assert_lt(i, len_);
  633. return cs_[i];
  634. }
  635. /**
  636. * Copy 'sz' bytes from buffer 'b' into this string. memcpy is used, not
  637. * operator=.
  638. */
  639. virtual void install(const T* b, size_t sz) {
  640. if(sz == 0) return;
  641. resize(sz);
  642. memcpy(cs_, b, sz * sizeof(T));
  643. }
  644. /**
  645. * Copy 'sz' bytes from buffer 'b' into this string. memcpy is used, not
  646. * operator=.
  647. */
  648. virtual void install(const std::basic_string<T>& b) {
  649. size_t sz = b.length();
  650. if(sz == 0) return;
  651. resize(sz);
  652. memcpy(cs_, b.c_str(), sz * sizeof(T));
  653. }
  654. /**
  655. * Copy all bytes from zero-terminated buffer 'b' into this string.
  656. */
  657. void install(const T* b) {
  658. install(b, strlen(b));
  659. }
  660. /**
  661. * Copy 'sz' bytes from buffer 'b' into this string, reversing them
  662. * in the process.
  663. */
  664. void installReverse(const char* b, size_t sz) {
  665. if(sz == 0) return;
  666. resize(sz);
  667. for(size_t i = 0; i < sz; i++) {
  668. cs_[i] = b[sz-i-1];
  669. }
  670. len_ = sz;
  671. }
  672. /**
  673. * Copy 'sz' bytes from buffer 'b' into this string, reversing them
  674. * in the process.
  675. */
  676. void installReverse(const SString<T>& b) {
  677. installReverse(b.cs_, b.len_);
  678. }
  679. /**
  680. * Return true iff the two strings are equal.
  681. */
  682. bool operator==(const SString<T>& o) {
  683. return sstr_eq(*this, o);
  684. }
  685. /**
  686. * Return true iff the two strings are not equal.
  687. */
  688. bool operator!=(const SString<T>& o) {
  689. return sstr_neq(*this, o);
  690. }
  691. /**
  692. * Return true iff this string is less than given string.
  693. */
  694. bool operator<(const SString<T>& o) {
  695. return sstr_lt(*this, o);
  696. }
  697. /**
  698. * Return true iff this string is greater than given string.
  699. */
  700. bool operator>(const SString<T>& o) {
  701. return sstr_gt(*this, o);
  702. }
  703. /**
  704. * Return true iff this string is less than or equal to given string.
  705. */
  706. bool operator<=(const SString<T>& o) {
  707. return sstr_leq(*this, o);
  708. }
  709. /**
  710. * Return true iff this string is greater than or equal to given string.
  711. */
  712. bool operator>=(const SString<T>& o) {
  713. return sstr_geq(*this, o);
  714. }
  715. /**
  716. * Reverse the buffer in place.
  717. */
  718. void reverse() {
  719. for(size_t i = 0; i < (len_ >> 1); i++) {
  720. T tmp = get(i);
  721. set(get(len_-i-1), i);
  722. set(tmp, len_-i-1);
  723. }
  724. }
  725. /**
  726. * Reverse a substring of the buffer in place.
  727. */
  728. void reverseWindow(size_t off, size_t len) {
  729. assert_leq(off, len_);
  730. assert_leq(off + len, len_);
  731. size_t mid = len >> 1;
  732. for(size_t i = 0; i < mid; i++) {
  733. T tmp = get(off+i);
  734. set(get(off+len-i-1), off+i);
  735. set(tmp, off+len-i-1);
  736. }
  737. }
  738. /**
  739. * Set the first len elements of the buffer to el.
  740. */
  741. void fill(size_t len, const T& el) {
  742. assert_leq(len, len_);
  743. for(size_t i = 0; i < len; i++) {
  744. set(el, i);
  745. }
  746. }
  747. /**
  748. * Set all elements of the buffer to el.
  749. */
  750. void fill(const T& el) {
  751. fill(len_, el);
  752. }
  753. /**
  754. * Return the length of the string.
  755. */
  756. inline size_t length() const { return len_; }
  757. /**
  758. * Clear the buffer.
  759. */
  760. void clear() { len_ = 0; }
  761. /**
  762. * Return true iff the buffer is empty.
  763. */
  764. inline bool empty() const { return len_ == 0; }
  765. /**
  766. * Put a terminator in the 'len_'th element and then return a
  767. * pointer to the buffer. Useful for printing.
  768. */
  769. const char* toZBufXForm(const char *xform) const {
  770. ASSERT_ONLY(size_t xformElts = strlen(xform));
  771. // Lazily allocate space for print buffer
  772. if(printcs_ == NULL) {
  773. const_cast<char*&>(printcs_) = new char[len_+1];
  774. }
  775. char* printcs = const_cast<char*>(printcs_);
  776. assert(printcs != NULL);
  777. for(size_t i = 0; i < len_; i++) {
  778. assert_lt(cs_[i], (int)xformElts);
  779. printcs[i] = xform[cs_[i]];
  780. }
  781. printcs[len_] = 0;
  782. return printcs_;
  783. }
  784. /**
  785. * Put a terminator in the 'len_'th element and then return a
  786. * pointer to the buffer. Useful for printing.
  787. */
  788. virtual const T* toZBuf() const {
  789. const_cast<T*>(cs_)[len_] = 0;
  790. return cs_;
  791. }
  792. /**
  793. * Return a const version of the raw buffer.
  794. */
  795. const T* buf() const { return cs_; }
  796. /**
  797. * Return a writeable version of the raw buffer.
  798. */
  799. T* wbuf() { return cs_; }
  800. protected:
  801. T *cs_; // +1 so that we have the option of dropping in a terminating "\0"
  802. char *printcs_; // +1 so that we have the option of dropping in a terminating "\0"
  803. size_t len_; // # elements
  804. };
  805. /**
  806. * Simple string class with backing memory whose size is managed by the user
  807. * using the constructor and install() member function. No behind-the-scenes
  808. * reallocation or copying takes place.
  809. */
  810. class S2bDnaString {
  811. public:
  812. explicit S2bDnaString() :
  813. cs_(NULL),
  814. printcs_(NULL),
  815. len_(0)
  816. { }
  817. explicit S2bDnaString(size_t sz) :
  818. cs_(NULL),
  819. printcs_(NULL),
  820. len_(0)
  821. {
  822. resize(sz);
  823. }
  824. /**
  825. * Copy another object of the same class.
  826. */
  827. S2bDnaString(const S2bDnaString& o) :
  828. cs_(NULL),
  829. printcs_(NULL),
  830. len_(0)
  831. {
  832. *this = o;
  833. }
  834. /**
  835. * Create an SStringExpandable from a std::basic_string of the
  836. * appropriate type.
  837. */
  838. explicit S2bDnaString(
  839. const std::basic_string<char>& str,
  840. bool chars = false,
  841. bool colors = false) :
  842. cs_(NULL),
  843. printcs_(NULL),
  844. len_(0)
  845. {
  846. if(chars) {
  847. if(colors) {
  848. installColors(str.c_str(), str.length());
  849. } else {
  850. installChars(str.c_str(), str.length());
  851. }
  852. } else {
  853. install(str.c_str(), str.length());
  854. }
  855. }
  856. /**
  857. * Create an SStringExpandable from an array and size.
  858. */
  859. explicit S2bDnaString(
  860. const char* b,
  861. size_t sz,
  862. bool chars = false,
  863. bool colors = false) :
  864. cs_(NULL),
  865. printcs_(NULL),
  866. len_(0)
  867. {
  868. if(chars) {
  869. if(colors) {
  870. installColors(b, sz);
  871. } else {
  872. installChars(b, sz);
  873. }
  874. } else {
  875. install(b, sz);
  876. }
  877. }
  878. /**
  879. * Create an SStringFixed from a zero-terminated string.
  880. */
  881. explicit S2bDnaString(
  882. const char* b,
  883. bool chars = false,
  884. bool colors = false) :
  885. cs_(NULL),
  886. printcs_(NULL),
  887. len_(0)
  888. {
  889. if(chars) {
  890. if(colors) {
  891. installColors(b, strlen(b));
  892. } else {
  893. installChars(b, strlen(b));
  894. }
  895. } else {
  896. install(b, strlen(b));
  897. }
  898. }
  899. /**
  900. * Destroy the expandable string object.
  901. */
  902. virtual ~S2bDnaString() {
  903. if(cs_ != NULL) {
  904. delete[] cs_;
  905. cs_ = NULL;
  906. }
  907. if(printcs_ != NULL) {
  908. delete[] printcs_;
  909. printcs_ = NULL;
  910. }
  911. len_ = 0;
  912. }
  913. /**
  914. * Assignment to other SString.
  915. */
  916. template<typename T>
  917. S2bDnaString& operator=(const T& o) {
  918. install(o.c_str(), o.length());
  919. return *this;
  920. }
  921. /**
  922. * Assignment from a std::basic_string
  923. */
  924. template<typename T>
  925. S2bDnaString& operator=(const std::basic_string<char>& o) {
  926. install(o);
  927. return *this;
  928. }
  929. /**
  930. * Resizes the string without preserving its contents.
  931. */
  932. void resize(size_t sz) {
  933. if(cs_ != NULL) {
  934. delete cs_;
  935. cs_ = NULL;
  936. }
  937. if(printcs_ != NULL) {
  938. delete printcs_;
  939. printcs_ = NULL;
  940. }
  941. len_ = sz;
  942. if(sz != 0) {
  943. cs_ = new uint32_t[nwords()];
  944. }
  945. }
  946. /**
  947. * Return DNA character corresponding to element 'idx'.
  948. */
  949. char toChar(size_t idx) const {
  950. int c = (int)get(idx);
  951. assert_range(0, 3, c);
  952. return "ACGT"[c];
  953. }
  954. /**
  955. * Return color character corresponding to element 'idx'.
  956. */
  957. char toColor(size_t idx) const {
  958. int c = (int)get(idx);
  959. assert_range(0, 3, c);
  960. return "0123"[c];
  961. }
  962. /**
  963. * Return ith character from the left of either the forward or the
  964. * reverse version of the read.
  965. */
  966. char windowGet(
  967. size_t i,
  968. bool fw,
  969. size_t depth = 0,
  970. size_t len = 0) const
  971. {
  972. if(len == 0) len = len_;
  973. assert_lt(i, len);
  974. assert_leq(len, len_ - depth);
  975. return fw ? get(depth+i) : get(depth+len-i-1);
  976. }
  977. /**
  978. * Return ith character from the left of either the forward or the
  979. * reverse-complement version of the read.
  980. */
  981. template<typename T>
  982. void windowGet(
  983. T& ret,
  984. bool fw,
  985. size_t depth = 0,
  986. size_t len = 0) const
  987. {
  988. if(len == 0) len = len_;
  989. assert_leq(len, len_ - depth);
  990. ret.resize(len);
  991. for(size_t i = 0; i < len; i++) {
  992. ret.set((fw ? get(depth+i) : get(depth+len-i-1)), i);
  993. }
  994. }
  995. /**
  996. * Return length in 32-bit words.
  997. */
  998. size_t nwords() const {
  999. return (len_ + 15) >> 4;
  1000. }
  1001. /**
  1002. * Set character at index 'idx' to 'c'.
  1003. */
  1004. void set(int c, size_t idx) {
  1005. assert_lt(idx, len_);
  1006. assert_range(0, 3, c);
  1007. size_t word = idx >> 4;
  1008. size_t bpoff = (idx & 15) << 1;
  1009. cs_[word] = cs_[word] & ~(uint32_t)(3 << bpoff);
  1010. cs_[word] = cs_[word] | (uint32_t)(c << bpoff);
  1011. }
  1012. /**
  1013. * Set character at index 'idx' to DNA char 'c'.
  1014. */
  1015. void setChar(int c, size_t idx) {
  1016. assert_in(toupper(c), "ACGT");
  1017. int bp = asc2dna[c];
  1018. set(bp, idx);
  1019. }
  1020. /**
  1021. * Set character at index 'idx' to color char 'c'.
  1022. */
  1023. void setColor(int c, size_t idx) {
  1024. assert_in(toupper(c), "0123");
  1025. int co = asc2col[c];
  1026. set(co, idx);
  1027. }
  1028. /**
  1029. * Set the ith 32-bit word to given word.
  1030. */
  1031. void setWord(uint32_t w, size_t i) {
  1032. assert_lt(i, nwords());
  1033. cs_[i] = w;
  1034. }
  1035. /**
  1036. * Retrieve constant version of element i.
  1037. */
  1038. char operator[](size_t i) const {
  1039. assert_lt(i, len_);
  1040. return get(i);
  1041. }
  1042. /**
  1043. * Retrieve constant version of element i.
  1044. */
  1045. char get(size_t i) const {
  1046. assert_lt(i, len_);
  1047. size_t word = i >> 4;
  1048. size_t bpoff = (i & 15) << 1;
  1049. return (char)((cs_[word] >> bpoff) & 3);
  1050. }
  1051. /**
  1052. * Copy packed words from string 'b' into this packed string.
  1053. */
  1054. void install(const uint32_t* b, size_t sz) {
  1055. if(sz == 0) return;
  1056. resize(sz);
  1057. memcpy(cs_, b, sizeof(uint32_t)*nwords());
  1058. }
  1059. /**
  1060. * Copy 'sz' DNA characters encoded as integers from buffer 'b' into this
  1061. * packed string.
  1062. */
  1063. void install(const char* b, size_t sz) {
  1064. if(sz == 0) return;
  1065. resize(sz);
  1066. size_t wordi = 0;
  1067. for(size_t i = 0; i < sz; i += 16) {
  1068. uint32_t word = 0;
  1069. for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
  1070. uint32_t bp = (int)b[i+j];
  1071. uint32_t shift = (uint32_t)j << 1;
  1072. assert_range(0, 3, (int)bp);
  1073. word |= (bp << shift);
  1074. }
  1075. cs_[wordi++] = word;
  1076. }
  1077. }
  1078. /**
  1079. * Copy 'sz' DNA characters from buffer 'b' into this packed string.
  1080. */
  1081. void installChars(const char* b, size_t sz) {
  1082. if(sz == 0) return;
  1083. resize(sz);
  1084. size_t wordi = 0;
  1085. for(size_t i = 0; i < sz; i += 16) {
  1086. uint32_t word = 0;
  1087. for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
  1088. char c = b[i+j];
  1089. assert_in(toupper(c), "ACGT");
  1090. int bp = asc2dna[(int)c];
  1091. assert_range(0, 3, (int)bp);
  1092. uint32_t shift = (uint32_t)j << 1;
  1093. word |= (bp << shift);
  1094. }
  1095. cs_[wordi++] = word;
  1096. }
  1097. }
  1098. /**
  1099. * Copy 'sz' color characters from buffer 'b' into this packed string.
  1100. */
  1101. void installColors(const char* b, size_t sz) {
  1102. if(sz == 0) return;
  1103. resize(sz);
  1104. size_t wordi = 0;
  1105. for(size_t i = 0; i < sz; i += 16) {
  1106. uint32_t word = 0;
  1107. for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
  1108. char c = b[i+j];
  1109. assert_in(c, "0123");
  1110. int bp = asc2col[(int)c];
  1111. assert_range(0, 3, (int)bp);
  1112. uint32_t shift = (uint32_t)j << 1;
  1113. word |= (bp << shift);
  1114. }
  1115. cs_[wordi++] = word;
  1116. }
  1117. }
  1118. /**
  1119. * Copy 'sz' DNA characters from buffer 'b' into this packed string.
  1120. */
  1121. void install(const char* b) {
  1122. install(b, strlen(b));
  1123. }
  1124. /**
  1125. * Copy 'sz' DNA characters from buffer 'b' into this packed string.
  1126. */
  1127. void installChars(const char* b) {
  1128. installChars(b, strlen(b));
  1129. }
  1130. /**
  1131. * Copy 'sz' DNA characters from buffer 'b' into this packed string.
  1132. */
  1133. void installColors(const char* b) {
  1134. installColors(b, strlen(b));
  1135. }
  1136. /**
  1137. * Copy 'sz' DNA characters from buffer 'b' into this packed string.
  1138. */
  1139. void install(const std::basic_string<char>& b) {
  1140. install(b.c_str(), b.length());
  1141. }
  1142. /**
  1143. * Copy 'sz' DNA characters from buffer 'b' into this packed string.
  1144. */
  1145. void installChars(const std::basic_string<char>& b) {
  1146. installChars(b.c_str(), b.length());
  1147. }
  1148. /**
  1149. * Copy 'sz' DNA characters from buffer 'b' into this packed string.
  1150. */
  1151. void installColors(const std::basic_string<char>& b) {
  1152. installColors(b.c_str(), b.length());
  1153. }
  1154. /**
  1155. * Copy 'sz' bytes from buffer 'b' into this string, reversing them
  1156. * in the process.
  1157. */
  1158. void installReverse(const char* b, size_t sz) {
  1159. resize(sz);
  1160. if(sz == 0) return;
  1161. size_t wordi = 0;
  1162. size_t bpi = 0;
  1163. cs_[0] = 0;
  1164. for(size_t i =sz; i > 0; i--) {
  1165. assert_range(0, 3, (int)b[i-1]);
  1166. cs_[wordi] |= ((int)b[i-1] << (bpi<<1));
  1167. if(bpi == 15) {
  1168. wordi++;
  1169. cs_[wordi] = 0;
  1170. bpi = 0;
  1171. } else bpi++;
  1172. }
  1173. }
  1174. /**
  1175. * Copy all chars from buffer of DNA characters 'b' into this string,
  1176. * reversing them in the process.
  1177. */
  1178. void installReverse(const char* b) {
  1179. installReverse(b, strlen(b));
  1180. }
  1181. /**
  1182. * Copy 'sz' bytes from buffer of DNA characters 'b' into this string,
  1183. * reversing them in the process.
  1184. */
  1185. void installReverseChars(const char* b, size_t sz) {
  1186. resize(sz);
  1187. if(sz == 0) return;
  1188. size_t wordi = 0;
  1189. size_t bpi = 0;
  1190. cs_[0] = 0;
  1191. for(size_t i =sz; i > 0; i--) {
  1192. char c = b[i-1];
  1193. assert_in(toupper(c), "ACGT");
  1194. int bp = asc2dna[(int)c];
  1195. assert_range(0, 3, bp);
  1196. cs_[wordi] |= (bp << (bpi<<1));
  1197. if(bpi == 15) {
  1198. wordi++;
  1199. cs_[wordi] = 0;
  1200. bpi = 0;
  1201. } else bpi++;
  1202. }
  1203. }
  1204. /**
  1205. * Copy all chars from buffer of DNA characters 'b' into this string,
  1206. * reversing them in the process.
  1207. */
  1208. void installReverseChars(const char* b) {
  1209. installReverseChars(b, strlen(b));
  1210. }
  1211. /**
  1212. * Copy 'sz' bytes from buffer of color characters 'b' into this string,
  1213. * reversing them in the process.
  1214. */
  1215. void installReverseColors(const char* b, size_t sz) {
  1216. resize(sz);
  1217. if(sz == 0) return;
  1218. size_t wordi = 0;
  1219. size_t bpi = 0;
  1220. cs_[0] = 0;
  1221. for(size_t i =sz; i > 0; i--) {
  1222. char c = b[i-1];
  1223. assert_in(c, "0123");
  1224. int bp = asc2col[(int)c];
  1225. assert_range(0, 3, bp);
  1226. cs_[wordi] |= (bp << (bpi<<1));
  1227. if(bpi == 15) {
  1228. wordi++;
  1229. cs_[wordi] = 0;
  1230. bpi = 0;
  1231. } else bpi++;
  1232. }
  1233. }
  1234. /**
  1235. * Copy all chars from buffer of color characters 'b' into this string,
  1236. * reversing them in the process.
  1237. */
  1238. void installReverseColors(const char* b) {
  1239. installReverseColors(b, strlen(b));
  1240. }
  1241. /**
  1242. * Copy 'sz' bytes from buffer 'b' into this string, reversing them
  1243. * in the process.
  1244. */
  1245. void installReverse(const S2bDnaString& b) {
  1246. resize(b.len_);
  1247. if(b.len_ == 0) return;
  1248. size_t wordi = 0;
  1249. size_t bpi = 0;
  1250. size_t wordb = b.nwords()-1;
  1251. size_t bpb = (b.len_-1) & 15;
  1252. cs_[0] = 0;
  1253. for(size_t i = b.len_; i > 0; i--) {
  1254. int bbp = (int)((b[wordb] >> (bpb << 1)) & 3);
  1255. assert_range(0, 3, bbp);
  1256. cs_[wordi] |= (bbp << (bpi << 1));
  1257. if(bpi == 15) {
  1258. wordi++;
  1259. cs_[wordi] = 0;
  1260. bpi = 0;
  1261. } else bpi++;
  1262. if(bpb == 0) {
  1263. wordb--;
  1264. bpi = 15;
  1265. } else bpi--;
  1266. }
  1267. }
  1268. /**
  1269. * Return true iff the two strings are equal.
  1270. */
  1271. bool operator==(const S2bDnaString& o) {
  1272. return sstr_eq(*this, o);
  1273. }
  1274. /**
  1275. * Return true iff the two strings are not equal.
  1276. */
  1277. bool operator!=(const S2bDnaString& o) {
  1278. return sstr_neq(*this, o);
  1279. }
  1280. /**
  1281. * Return true iff this string is less than given string.
  1282. */
  1283. bool operator<(const S2bDnaString& o) {
  1284. return sstr_lt(*this, o);
  1285. }
  1286. /**
  1287. * Return true iff this string is greater than given string.
  1288. */
  1289. bool operator>(const S2bDnaString& o) {
  1290. return sstr_gt(*this, o);
  1291. }
  1292. /**
  1293. * Return true iff this string is less than or equal to given string.
  1294. */
  1295. bool operator<=(const S2bDnaString& o) {
  1296. return sstr_leq(*this, o);
  1297. }
  1298. /**
  1299. * Return true iff this string is greater than or equal to given string.
  1300. */
  1301. bool operator>=(const S2bDnaString& o) {
  1302. return sstr_geq(*this, o);
  1303. }
  1304. /**
  1305. * Reverse the 2-bit encoded DNA string in-place.
  1306. */
  1307. void reverse() {
  1308. if(len_ <= 1) return;
  1309. size_t wordf = nwords()-1;
  1310. size_t bpf = (len_-1) & 15;
  1311. size_t wordi = 0;
  1312. size_t bpi = 0;
  1313. while(wordf > wordi || (wordf == wordi && bpf > bpi)) {
  1314. int f = (cs_[wordf] >> (bpf << 1)) & 3;
  1315. int i = (cs_[wordi] >> (bpi << 1)) & 3;
  1316. cs_[wordf] &= ~(uint32_t)(3 << (bpf << 1));
  1317. cs_[wordi] &= ~(uint32_t)(3 << (bpi << 1));
  1318. cs_[wordf] |= (uint32_t)(i << (bpf << 1));
  1319. cs_[wordi] |= (uint32_t)(f << (bpi << 1));
  1320. if(bpf == 0) {
  1321. bpf = 15;
  1322. wordf--;
  1323. } else bpf--;
  1324. if(bpi == 15) {
  1325. bpi = 0;
  1326. wordi++;
  1327. } else bpi++;
  1328. }
  1329. }
  1330. /**
  1331. * Reverse a substring of the buffer in place.
  1332. */
  1333. void reverseWindow(size_t off, size_t len) {
  1334. assert_leq(off, len_);
  1335. assert_leq(off+len, len_);
  1336. if(len <= 1) return;
  1337. size_t wordf = (off+len-1) >> 4;
  1338. size_t bpf = (off+len-1) & 15;
  1339. size_t wordi = (off ) >> 4;
  1340. size_t bpi = (off ) & 15;
  1341. while(wordf > wordi || (wordf == wordi && bpf > bpi)) {
  1342. int f = (cs_[wordf] >> (bpf << 1)) & 3;
  1343. int i = (cs_[wordi] >> (bpi << 1)) & 3;
  1344. cs_[wordf] &= ~(uint32_t)(3 << (bpf << 1));
  1345. cs_[wordi] &= ~(uint32_t)(3 << (bpi << 1));
  1346. cs_[wordf] |= (uint32_t)(i << (bpf << 1));
  1347. cs_[wordi] |= (uint32_t)(f << (bpi << 1));
  1348. if(bpf == 0) {
  1349. bpf = 15;
  1350. wordf--;
  1351. } else bpf--;
  1352. if(bpi == 15) {
  1353. bpi = 0;
  1354. wordi++;
  1355. } else bpi++;
  1356. }
  1357. }
  1358. /**
  1359. * Set the first len elements of the buffer to el.
  1360. */
  1361. void fill(size_t len, char el) {
  1362. assert_leq(len, len_);
  1363. assert_range(0, 3, (int)el);
  1364. size_t word = 0;
  1365. if(len > 32) {
  1366. // Copy el throughout block
  1367. uint32_t bl = (uint32_t)el;
  1368. bl |= (bl << 2);
  1369. bl |= (bl << 4);
  1370. bl |= (bl << 8);
  1371. bl |= (bl << 16);
  1372. // Fill with blocks
  1373. size_t blen = len >> 4;
  1374. for(; word < blen; word++) {
  1375. cs_[word] = bl;
  1376. }
  1377. len = len & 15;
  1378. }
  1379. size_t bp = 0;
  1380. for(size_t i = 0; i < len; i++) {
  1381. cs_[word] &= ~(uint32_t)(3 << (bp << 1));
  1382. cs_[word] |= (uint32_t)(el << (bp << 1));
  1383. if(bp == 15) {
  1384. word++;
  1385. bp = 0;
  1386. } else bp++;
  1387. }
  1388. }
  1389. /**
  1390. * Set all elements of the buffer to el.
  1391. */
  1392. void fill(char el) {
  1393. fill(len_, el);
  1394. }
  1395. /**
  1396. * Return the ith character in the window defined by fw, color, depth and
  1397. * len.
  1398. */
  1399. char windowGetDna(
  1400. size_t i,
  1401. bool fw,
  1402. bool color,
  1403. size_t depth = 0,
  1404. size_t len = 0) const
  1405. {
  1406. if(len == 0) len = len_;
  1407. assert_lt(i, len);
  1408. assert_leq(len, len_ - depth);
  1409. if(fw) {
  1410. return get(depth+i);
  1411. } else {
  1412. return
  1413. color ?
  1414. get(depth+len-i-1) :
  1415. compDna(get(depth+len-i-1));
  1416. }
  1417. }
  1418. /**
  1419. * Fill the given DNA buffer with the substring specified by fw,
  1420. * color, depth and len.
  1421. */
  1422. template<typename T>
  1423. void windowGetDna(
  1424. T& buf,
  1425. bool fw,
  1426. bool color,
  1427. size_t depth = 0,
  1428. size_t len = 0) const
  1429. {
  1430. if(len == 0) len = len_;
  1431. assert_leq(len, len_ - depth);
  1432. buf.resize(len);
  1433. for(size_t i = 0; i < len; i++) {
  1434. buf.set(
  1435. (fw ?
  1436. get(depth+i) :
  1437. (color ?
  1438. get(depth+len-i-1) :
  1439. compDna(get(depth+len-i-1)))), i);
  1440. }
  1441. }
  1442. /**
  1443. * Return the length of the string.
  1444. */
  1445. inline size_t length() const { return len_; }
  1446. /**
  1447. * Clear the buffer.
  1448. */
  1449. void clear() { len_ = 0; }
  1450. /**
  1451. * Return true iff the buffer is empty.
  1452. */
  1453. inline bool empty() const { return len_ == 0; }
  1454. /**
  1455. * Return a const version of the raw buffer.
  1456. */
  1457. const uint32_t* buf() const { return cs_; }
  1458. /**
  1459. * Return a writeable version of the raw buffer.
  1460. */
  1461. uint32_t* wbuf() { return cs_; }
  1462. /**
  1463. * Note: the size of the string once it's stored in the print buffer is 4
  1464. * times as large as the string as stored in compact 2-bit-per-char words.
  1465. */
  1466. const char* toZBuf() const {
  1467. if(printcs_ == NULL) {
  1468. const_cast<char*&>(printcs_) = new char[len_+1];
  1469. }
  1470. char *printcs = const_cast<char*>(printcs_);
  1471. size_t word = 0, bp = 0;
  1472. for(size_t i = 0; i < len_; i++) {
  1473. int c = (cs_[word] >> (bp << 1)) & 3;
  1474. printcs[i] = "ACGT"[c];
  1475. if(bp == 15) {
  1476. word++;
  1477. bp = 0;
  1478. } else bp++;
  1479. }
  1480. printcs[len_] = '\0';
  1481. return printcs_;
  1482. }
  1483. protected:
  1484. uint32_t *cs_; // 2-bit packed words
  1485. char *printcs_;
  1486. size_t len_; // # elements
  1487. };
  1488. /**
  1489. * Simple string class with backing memory that automatically expands as needed.
  1490. */
  1491. template<typename T, int S = 1024, int M = 2>
  1492. class SStringExpandable {
  1493. public:
  1494. explicit SStringExpandable() :
  1495. cs_(NULL),
  1496. printcs_(NULL),
  1497. len_(0),
  1498. sz_(0)
  1499. { }
  1500. explicit SStringExpandable(size_t sz) :
  1501. cs_(NULL),
  1502. printcs_(NULL),
  1503. len_(0),
  1504. sz_(0)
  1505. {
  1506. expandNoCopy(sz);
  1507. }
  1508. /**
  1509. * Create an SStringExpandable from another SStringExpandable.
  1510. */
  1511. SStringExpandable(const SStringExpandable<T, S>& o) :
  1512. cs_(NULL),
  1513. printcs_(NULL),
  1514. len_(0),
  1515. sz_(0)
  1516. {
  1517. *this = o;
  1518. }
  1519. /**
  1520. * Create an SStringExpandable from a std::basic_string of the
  1521. * appropriate type.
  1522. */
  1523. explicit SStringExpandable(const std::basic_string<T>& str) :
  1524. cs_(NULL),
  1525. printcs_(NULL),
  1526. len_(0),
  1527. sz_(0)
  1528. {
  1529. install(str.c_str(), str.length());
  1530. }
  1531. /**
  1532. * Create an SStringExpandable from an array and size.
  1533. */
  1534. explicit SStringExpandable(const T* b, size_t sz) :
  1535. cs_(NULL),
  1536. printcs_(NULL),
  1537. len_(0),
  1538. sz_(0)
  1539. {
  1540. install(b, sz);
  1541. }
  1542. /**
  1543. * Create an SStringExpandable from a zero-terminated array.
  1544. */
  1545. explicit SStringExpandable(const T* b) :
  1546. cs_(NULL),
  1547. printcs_(NULL),
  1548. len_(0),
  1549. sz_(0)
  1550. {
  1551. install(b, strlen(b));
  1552. }
  1553. /**
  1554. * Destroy the expandable string object.
  1555. */
  1556. virtual ~SStringExpandable() {
  1557. if(cs_ != NULL) {
  1558. delete[] cs_;
  1559. cs_ = NULL;
  1560. }
  1561. if(printcs_ != NULL) {
  1562. delete[] printcs_;
  1563. printcs_ = NULL;
  1564. }
  1565. sz_ = len_ = 0;
  1566. }
  1567. /**
  1568. * Return ith character from the left of either the forward or the
  1569. * reverse-complement version of the read.
  1570. */
  1571. T windowGet(
  1572. size_t i,
  1573. bool fw,
  1574. size_t depth = 0,
  1575. size_t len = 0) const
  1576. {
  1577. if(len == 0) len = len_;
  1578. assert_lt(i, len);
  1579. assert_leq(len, len_ - depth);
  1580. return fw ? cs_[depth+i] : cs_[depth+len-i-1];
  1581. }
  1582. /**
  1583. * Return ith character from the left of either the forward or the
  1584. * reverse-complement version of the read.
  1585. */
  1586. void windowGet(
  1587. T& ret,
  1588. bool fw,
  1589. size_t depth = 0,
  1590. size_t len = 0) const
  1591. {
  1592. if(len == 0) len = len_;
  1593. assert_leq(len, len_ - depth);
  1594. for(size_t i = 0; i < len; i++) {
  1595. ret.append(fw ? cs_[depth+i] : cs_[depth+len-i-1]);
  1596. }
  1597. }
  1598. /**
  1599. * Assignment to other SStringFixed.
  1600. */
  1601. SStringExpandable<T,S>& operator=(const SStringExpandable<T,S>& o) {
  1602. install(o.cs_, o.len_);
  1603. return *this;
  1604. }
  1605. /**
  1606. * Assignment from a std::basic_string
  1607. */
  1608. SStringExpandable<T,S>& operator=(const std::basic_string<T>& o) {
  1609. install(o.c_str(), o.length());
  1610. return *this;
  1611. }
  1612. /**
  1613. * Insert char c before position 'idx'; slide subsequent chars down.
  1614. */
  1615. void insert(const T& c, size_t idx) {
  1616. assert_lt(idx, len_);
  1617. if(sz_ < len_ + 1) expandCopy((len_ + 1 + S) * M);
  1618. len_++;
  1619. // Move everyone down by 1
  1620. // len_ is the *new* length
  1621. for(size_t i = len_; i > idx+1; i--) {
  1622. cs_[i-1] = cs_[i-2];
  1623. }
  1624. cs_[idx] = c;
  1625. }
  1626. /**
  1627. * Set character at index 'idx' to 'c'.
  1628. */
  1629. void set(int c, size_t idx) {
  1630. assert_lt(idx, len_);
  1631. cs_[idx] = c;
  1632. }
  1633. /**
  1634. * Append char c.
  1635. */
  1636. void append(const T& c) {
  1637. if(sz_ < len_ + 1) expandCopy((len_ + 1 + S) * M);
  1638. cs_[len_++] = c;
  1639. }
  1640. /**
  1641. * Delete char at position 'idx'; slide subsequent chars up.
  1642. */
  1643. void remove(size_t idx) {
  1644. assert_lt(idx, len_);
  1645. assert_gt(len_, 0);
  1646. for(size_t i = idx; i < len_-1; i++) {
  1647. cs_[i] = cs_[i+1];
  1648. }
  1649. len_--;
  1650. }
  1651. /**
  1652. * Retrieve constant version of element i.
  1653. */
  1654. const T& operator[](size_t i) const {
  1655. assert_lt(i, len_);
  1656. return cs_[i];
  1657. }
  1658. /**
  1659. * Retrieve mutable version of element i.
  1660. */
  1661. T& operator[](size_t i) {
  1662. assert_lt(i, len_);
  1663. return cs_[i];
  1664. }
  1665. /**
  1666. * Retrieve constant version of element i.
  1667. */
  1668. const T& get(size_t i) const {
  1669. assert_lt(i, len_);
  1670. return cs_[i];
  1671. }
  1672. /**
  1673. * Copy 'sz' bytes from buffer 'b' into this string.
  1674. */
  1675. virtual void install(const T* b, size_t sz) {
  1676. if(sz_ < sz) expandNoCopy((sz + S) * M);
  1677. memcpy(cs_, b, sz * sizeof(T));
  1678. len_ = sz;
  1679. }
  1680. /**
  1681. * Copy all bytes from zero-terminated buffer 'b' into this string.
  1682. */
  1683. void install(const T* b) { install(b, strlen(b)); }
  1684. /**
  1685. * Copy 'sz' bytes from buffer 'b' into this string, reversing them
  1686. * in the process.
  1687. */
  1688. void installReverse(const char* b, size_t sz) {
  1689. if(sz_ < sz) expandNoCopy((sz + S) * M);
  1690. for(size_t i = 0; i < sz; i++) {
  1691. cs_[i] = b[sz-i-1];
  1692. }
  1693. len_ = sz;
  1694. }
  1695. /**
  1696. * Copy 'sz' bytes from buffer 'b' into this string, reversing them
  1697. * in the process.
  1698. */
  1699. void installReverse(const SStringExpandable<T, S>& b) {
  1700. if(sz_ < b.len_) expandNoCopy((b.len_ + S) * M);
  1701. for(size_t i = 0; i < b.len_; i++) {
  1702. cs_[i] = b.cs_[b.len_ - i - 1];
  1703. }
  1704. len_ = b.len_;
  1705. }
  1706. /**
  1707. * Return true iff the two strings are equal.
  1708. */
  1709. bool operator==(const SStringExpandable<T, S>& o) {
  1710. return sstr_eq(*this, o);
  1711. }
  1712. /**
  1713. * Return true iff the two strings are not equal.
  1714. */
  1715. bool operator!=(const SStringExpandable<T, S>& o) {
  1716. return sstr_neq(*this, o);
  1717. }
  1718. /**
  1719. * Return true iff this string is less than given string.
  1720. */
  1721. bool operator<(const SStringExpandable<T, S>& o) {
  1722. return sstr_lt(*this, o);
  1723. }
  1724. /**
  1725. * Return true iff this string is greater than given string.
  1726. */
  1727. bool operator>(const SStringExpandable<T, S>& o) {
  1728. return sstr_gt(*this, o);
  1729. }
  1730. /**
  1731. * Return true iff this string is less than or equal to given string.
  1732. */
  1733. bool operator<=(const SStringExpandable<T, S>& o) {
  1734. return sstr_leq(*this, o);
  1735. }
  1736. /**
  1737. * Return true iff this string is greater than or equal to given string.
  1738. */
  1739. bool operator>=(const SStringExpandable<T, S>& o) {
  1740. return sstr_geq(*this, o);
  1741. }
  1742. /**
  1743. * Reverse the buffer in place.
  1744. */
  1745. void reverse() {
  1746. for(size_t i = 0; i < (len_ >> 1); i++) {
  1747. T tmp = get(i);
  1748. set(get(len_-i-1), i);
  1749. set(tmp, len_-i-1);
  1750. }
  1751. }
  1752. /**
  1753. * Reverse a substring of the buffer in place.
  1754. */
  1755. void reverseWindow(size_t off, size_t len) {
  1756. assert_leq(off, len_);
  1757. assert_leq(off + len, len_);
  1758. size_t mid = len >> 1;
  1759. for(size_t i = 0; i < mid; i++) {
  1760. T tmp = get(off+i);
  1761. set(get(off+len-i-1), off+i);
  1762. set(tmp, off+len-i-1);
  1763. }
  1764. }
  1765. /**
  1766. * Simply resize the buffer. If the buffer is resized to be
  1767. * longer, the newly-added elements will contain garbage and should
  1768. * be initialized immediately.
  1769. */
  1770. void resize(size_t len) {
  1771. if(sz_ < len) expandCopy((len + S) * M);
  1772. len_ = len;
  1773. }
  1774. /**
  1775. * Simply resize the buffer. If the buffer is resized to be
  1776. * longer, new elements will be initialized with 'el'.
  1777. */
  1778. void resize(size_t len, const T& el) {
  1779. if(sz_ < len) expandCopy((len + S) * M);
  1780. if(len > len_) {
  1781. for(size_t i = len_; i < len; i++) {
  1782. cs_[i] = el;
  1783. }
  1784. }
  1785. len_ = len;
  1786. }
  1787. /**
  1788. * Set the first len elements of the buffer to el.
  1789. */
  1790. void fill(size_t len, const T& el) {
  1791. assert_leq(len, len_);
  1792. for(size_t i = 0; i < len; i++) {
  1793. cs_[i] = el;
  1794. }
  1795. }
  1796. /**
  1797. * Set all elements of the buffer to el.
  1798. */
  1799. void fill(const T& el) {
  1800. fill(len_, el);
  1801. }
  1802. /**
  1803. * Trim len characters from the beginning of the string.
  1804. */
  1805. void trimBegin(size_t len) {
  1806. assert_leq(len, len_);
  1807. if(len == len_) {
  1808. len_ = 0; return;
  1809. }
  1810. for(size_t i = 0; i < len_-len; i++) {
  1811. cs_[i] = cs_[i+len];
  1812. }
  1813. len_ -= len;
  1814. }
  1815. /**
  1816. * Trim len characters from the end of the string.
  1817. */
  1818. void trimEnd(size_t len) {
  1819. if(len >= len_) len_ = 0;
  1820. else len_ -= len;
  1821. }
  1822. /**
  1823. * Copy 'sz' bytes from buffer 'b' into this string.
  1824. */
  1825. void append(const T* b, size_t sz) {
  1826. if(sz_ < len_ + sz) expandCopy((len_ + sz + S) * M);
  1827. memcpy(cs_ + len_, b, sz * sizeof(T));
  1828. len_ += sz;
  1829. }
  1830. /**
  1831. * Copy bytes from zero-terminated buffer 'b' into this string.
  1832. */
  1833. void append(const T* b) {
  1834. append(b, strlen(b));
  1835. }
  1836. /**
  1837. * Return the length of the string.
  1838. */
  1839. size_t length() const { return len_; }
  1840. /**
  1841. * Clear the buffer.
  1842. */
  1843. void clear() { len_ = 0; }
  1844. /**
  1845. * Return true iff the buffer is empty.
  1846. */
  1847. bool empty() const { return len_ == 0; }
  1848. /**
  1849. * Put a terminator in the 'len_'th element and then return a
  1850. * pointer to the buffer. Useful for printing.
  1851. */
  1852. const char* toZBufXForm(const char *xform) const {
  1853. ASSERT_ONLY(size_t xformElts = strlen(xform));
  1854. if(empty()) {
  1855. const_cast<char&>(zero_) = 0;
  1856. return &zero_;
  1857. }
  1858. char* printcs = const_cast<char*>(printcs_);
  1859. // Lazily allocate space for print buffer
  1860. for(size_t i = 0; i < len_; i++) {
  1861. assert_lt(cs_[i], (int)xformElts);
  1862. printcs[i] = xform[(int)cs_[i]];
  1863. }
  1864. printcs[len_] = 0;
  1865. return printcs_;
  1866. }
  1867. /**
  1868. * Put a terminator in the 'len_'th element and then return a
  1869. * pointer to the buffer. Useful for printing.
  1870. */
  1871. virtual const T* toZBuf() const {
  1872. if(empty()) {
  1873. const_cast<T&>(zeroT_) = 0;
  1874. return &zeroT_;
  1875. }
  1876. assert_leq(len_, sz_);
  1877. const_cast<T*>(cs_)[len_] = 0;
  1878. return cs_;
  1879. }
  1880. /**
  1881. * Return true iff this DNA string matches the given nucleotide
  1882. * character string.
  1883. */
  1884. bool eq(const char *str) const {
  1885. const char *self = toZBuf();
  1886. return strcmp(str, self) == 0;
  1887. }
  1888. /**
  1889. * Return a const version of the raw buffer.
  1890. */
  1891. const T* buf() const { return cs_; }
  1892. /**
  1893. * Return a writeable version of the raw buffer.
  1894. */
  1895. T* wbuf() { return cs_; }
  1896. protected:
  1897. /**
  1898. * Allocate new, bigger buffer and copy old contents into it. If
  1899. * requested size can be accommodated by current buffer, do nothing.
  1900. */
  1901. void expandCopy(size_t sz) {
  1902. if(sz_ >= sz) return; // done!
  1903. T *tmp = new T[sz + 1];
  1904. char *ptmp = new char[sz + 1];
  1905. if(cs_ != NULL) {
  1906. memcpy(tmp, cs_, sizeof(T)*len_);
  1907. delete[] cs_;
  1908. }
  1909. if(printcs_ != NULL) {
  1910. memcpy(ptmp, printcs_, sizeof(char)*len_);
  1911. delete[] printcs_;
  1912. }
  1913. cs_ = tmp;
  1914. printcs_ = ptmp;
  1915. sz_ = sz;
  1916. }
  1917. /**
  1918. * Allocate new, bigger buffer. If requested size can be
  1919. * accommodated by current buffer, do nothing.
  1920. */
  1921. void expandNoCopy(size_t sz) {
  1922. if(sz_ >= sz) return; // done!
  1923. if(cs_ != NULL) delete[] cs_;
  1924. if(printcs_ != NULL) delete[] printcs_;
  1925. cs_ = new T[sz + 1];
  1926. printcs_ = new char[sz + 1];
  1927. sz_ = sz;
  1928. }
  1929. T *cs_; // +1 so that we have the option of dropping in a terminating "\0"
  1930. char *printcs_; // +1 so that we have the option of dropping in a terminating "\0"
  1931. char zero_; // 0 terminator for empty string
  1932. T zeroT_; // 0 terminator for empty string
  1933. size_t len_; // # filled-in elements
  1934. size_t sz_; // size capacity of cs_
  1935. };
  1936. /**
  1937. * Simple string class with in-object storage.
  1938. *
  1939. * All copies induced by, e.g., operator=, the copy constructor,
  1940. * install() and append(), are shallow (using memcpy/sizeof). If deep
  1941. * copies are needed, use a different class.
  1942. *
  1943. * Reading from an uninitialized element results in an assert as long
  1944. * as NDEBUG is not defined. If NDEBUG is defined, the result is
  1945. * undefined.
  1946. */
  1947. template<typename T, int S>
  1948. class SStringFixed {
  1949. public:
  1950. explicit SStringFixed() : len_(0) { }
  1951. /**
  1952. * Create an SStringFixed from another SStringFixed.
  1953. */
  1954. SStringFixed(const SStringFixed<T, S>& o) {
  1955. *this = o;
  1956. }
  1957. /**
  1958. * Create an SStringFixed from another SStringFixed.
  1959. */
  1960. explicit SStringFixed(const std::basic_string<T>& str) {
  1961. install(str.c_str(), str.length());
  1962. }
  1963. /**
  1964. * Create an SStringFixed from an array and size.
  1965. */
  1966. explicit SStringFixed(const T* b, size_t sz) {
  1967. install(b, sz);
  1968. }
  1969. /**
  1970. * Create an SStringFixed from a zero-terminated string.
  1971. */
  1972. explicit SStringFixed(const T* b) {
  1973. install(b, strlen(b));
  1974. }
  1975. virtual ~SStringFixed() { } // C++ needs this
  1976. /**
  1977. * Retrieve constant version of element i.
  1978. */
  1979. inline const T& operator[](size_t i) const {
  1980. return get(i);
  1981. }
  1982. /**
  1983. * Retrieve mutable version of element i.
  1984. */
  1985. inline T& operator[](size_t i) {
  1986. return get(i);
  1987. }
  1988. /**
  1989. * Retrieve constant version of element i.
  1990. */
  1991. inline const T& get(size_t i) const {
  1992. assert_lt(i, len_);
  1993. return cs_[i];
  1994. }
  1995. /**
  1996. * Retrieve mutable version of element i.
  1997. */
  1998. inline T& get(size_t i) {
  1999. assert_lt(i, len_);
  2000. return cs_[i];
  2001. }
  2002. /**
  2003. * Return ith character from the left of either the forward or the
  2004. * reverse-complement version of the read.
  2005. */
  2006. T windowGet(
  2007. size_t i,
  2008. bool fw,
  2009. size_t depth = 0,
  2010. size_t len = 0) const
  2011. {
  2012. if(len == 0) len = len_;
  2013. assert_lt(i, len);
  2014. assert_leq(len, len_ - depth);
  2015. return fw ? cs_[depth+i] : cs_[depth+len-i-1];
  2016. }
  2017. /**
  2018. * Return ith character from the left of either the forward or the
  2019. * reverse-complement version of the read.
  2020. */
  2021. void windowGet(
  2022. T& ret,
  2023. bool fw,
  2024. size_t depth = 0,
  2025. size_t len = 0) const
  2026. {
  2027. if(len == 0) len = len_;
  2028. assert_leq(len, len_ - depth);
  2029. for(size_t i = 0; i < len; i++) {
  2030. ret.append(fw ? cs_[depth+i] : cs_[depth+len-i-1]);
  2031. }
  2032. }
  2033. /**
  2034. * Assignment to other SStringFixed.
  2035. */
  2036. SStringFixed<T,S>& operator=(const SStringFixed<T,S>& o) {
  2037. install(o.cs_, o.len_);
  2038. return *this;
  2039. }
  2040. /**
  2041. * Assignment from a std::basic_string
  2042. */
  2043. SStringFixed<T,S>& operator=(const std::basic_string<T>& o) {
  2044. install(o);
  2045. return *this;
  2046. }
  2047. /**
  2048. * Insert char c before position 'idx'; slide subsequent chars down.
  2049. */
  2050. void insert(const T& c, size_t idx) {
  2051. assert_lt(len_, S);
  2052. assert_lt(idx, len_);
  2053. // Move everyone down by 1
  2054. for(int i = len_; i > idx; i--) {
  2055. cs_[i] = cs_[i-1];
  2056. }
  2057. cs_[idx] = c;
  2058. len_++;
  2059. }
  2060. /**
  2061. * Set character at index 'idx' to 'c'.
  2062. */
  2063. void set(int c, size_t idx) {
  2064. assert_lt(idx, len_);
  2065. cs_[idx] = c;
  2066. }
  2067. /**
  2068. * Append char c.
  2069. */
  2070. void append(const T& c) {
  2071. assert_lt(len_, S);
  2072. cs_[len_++] = c;
  2073. }
  2074. /**
  2075. * Delete char at position 'idx'; slide subsequent chars up.
  2076. */
  2077. void remove(size_t idx) {
  2078. assert_lt(idx, len_);
  2079. assert_gt(len_, 0);
  2080. for(size_t i = idx; i < len_-1; i++) {
  2081. cs_[i] = cs_

Large files are truncated, but you can click here to view the full file