PageRenderTime 62ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/sstring.h

https://github.com/msGenDev/bowtie2
C Header | 3435 lines | 2138 code | 313 blank | 984 comment | 417 complexity | 37c2a47faa0c3ff5df42ad761e2f7c0a MD5 | raw file
Possible License(s): GPL-3.0
  1. /*
  2. * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
  3. *
  4. * This file is part of Bowtie 2.
  5. *
  6. * Bowtie 2 is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * Bowtie 2 is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #ifndef SSTRING_H_
  20. #define SSTRING_H_
  21. #include <string.h>
  22. #include <iostream>
  23. #include "assert_helpers.h"
  24. #include "alphabet.h"
  25. #include "random_source.h"
  26. /**
  27. * Four kinds of strings defined here:
  28. *
  29. * SString:
  30. * A fixed-length string using heap memory with size set at construction time
  31. * or when install() member is called.
  32. *
  33. * S2bDnaString:
  34. * Like SString, but stores a list of uint32_t words where each word is divided
  35. * into 16 2-bit slots interpreted as holding one A/C/G/T nucleotide each.
  36. *
  37. * TODO: S3bDnaString allowing N. S4bDnaString allowing nucleotide masks.
  38. *
  39. * SStringExpandable:
  40. * A string using heap memory where the size of the backing store is
  41. * automatically resized as needed. Supports operations like append, insert,
  42. * erase, etc.
  43. *
  44. * SStringFixed:
  45. * A fixed-length string using stack memory where size is set at compile
  46. * time.
  47. *
  48. * All string classes have some extra facilities that make it easy to print the
  49. * string, including when the string uses an encoded alphabet. See toZBuf()
  50. * and toZBufXForm().
  51. *
  52. * Global lt, eq, and gt template functions are supplied. They are capable of
  53. * doing lexicographical comparisons between any of the categories of
  54. * strings defined here.
  55. */
  /**
   * Length traits used by the sstr_* comparison helpers.
   *
   * The primary template handles any type exposing a length() member.  The
   * specializations below cover zero-terminated character buffers, whose
   * length is measured with strlen().
   */
  template<typename T>
  class Class_sstr_len {
  public:
      /** Return s.length(). */
      static inline size_t sstr_len(const T& s) {
          return s.length();
      }
  };

  /** Array of const char, treated as a zero-terminated string. */
  template<unsigned N>
  class Class_sstr_len<const char[N]> {
  public:
      static inline size_t sstr_len(const char s[N]) {
          return strlen(s);
      }
  };

  /**
   * Array of char.  When a string literal or char array is bound to a
   * `const T&` parameter, T is deduced as `char[N]` (the const is absorbed by
   * the parameter), so this specialization is the one the sstr_* helpers pick
   * for such arguments.
   */
  template<unsigned N>
  class Class_sstr_len<char[N]> {
  public:
      static inline size_t sstr_len(const char s[N]) {
          return strlen(s);
      }
  };

  /** Pointer to a zero-terminated buffer of const char. */
  template<>
  class Class_sstr_len<const char *> {
  public:
      static inline size_t sstr_len(const char *s) {
          return strlen(s);
      }
  };

  /** Pointer to a zero-terminated buffer of (mutable) char. */
  template<>
  class Class_sstr_len<char *> {
  public:
      static inline size_t sstr_len(const char *s) {
          return strlen(s);
      }
  };

  /** Pointer to a zero-terminated buffer of const unsigned char. */
  template<>
  class Class_sstr_len<const unsigned char *> {
  public:
      static inline size_t sstr_len(const unsigned char *s) {
          return strlen((const char *)s);
      }
  };

  /** Pointer to a zero-terminated buffer of (mutable) unsigned char. */
  template<>
  class Class_sstr_len<unsigned char *> {
  public:
      static inline size_t sstr_len(const unsigned char *s) {
          return strlen((const char *)s);
      }
  };
  84. template<typename T1, typename T2>
  85. static inline bool sstr_eq(const T1& s1, const T2& s2) {
  86. size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
  87. size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
  88. if(len1 != len2) return false;
  89. for(size_t i = 0; i < len1; i++) {
  90. if(s1[i] != s2[i]) return false;
  91. }
  92. return true;
  93. }
  /**
   * Return true iff s1 and s2 differ, i.e. the negation of sstr_eq().
   */
  template<typename T1, typename T2>
  static inline bool sstr_neq(const T1& s1, const T2& s2) {
      const bool same = sstr_eq(s1, s2);
      return !same;
  }
  98. /**
  99. * Return true iff the given suffix of s1 is equal to the given suffix of s2 up
  100. * to upto characters.
  101. */
  102. template<typename T1, typename T2>
  103. static inline bool sstr_suf_upto_eq(
  104. const T1& s1, size_t suf1,
  105. const T2& s2, size_t suf2,
  106. size_t upto,
  107. bool endlt = true)
  108. {
  109. assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
  110. assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
  111. size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
  112. size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
  113. if(len1 > upto) len1 = upto;
  114. if(len2 > upto) len2 = upto;
  115. if(len1 != len2) return false;
  116. for(size_t i = 0; i < len1; i++) {
  117. if(s1[suf1+i] != s2[suf2+i]) {
  118. return false;
  119. }
  120. }
  121. return true;
  122. }
  /**
   * Return true iff the given suffix of s1 is NOT equal to the given suffix of
   * s2 when at most upto elements of each are considered; this is the negation
   * of sstr_suf_upto_eq() called with the same arguments.
   */
  template<typename T1, typename T2>
  static inline bool sstr_suf_upto_neq(
      const T1& s1, size_t suf1,
      const T2& s2, size_t suf2,
      size_t upto,
      bool endlt = true)
  {
      const bool same = sstr_suf_upto_eq(s1, suf1, s2, suf2, upto, endlt);
      return !same;
  }
  136. /**
  137. * Return true iff s1 is less than s2.
  138. */
  139. template<typename T1, typename T2>
  140. static inline bool sstr_lt(const T1& s1, const T2& s2, bool endlt = true) {
  141. size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
  142. size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
  143. size_t minlen = (len1 < len2 ? len1 : len2);
  144. for(size_t i = 0; i < minlen; i++) {
  145. if(s1[i] < s2[i]) {
  146. return true;
  147. } else if(s1[i] > s2[i]) {
  148. return false;
  149. }
  150. }
  151. if(len1 == len2) return false;
  152. return (len1 < len2) == endlt;
  153. }
  154. /**
  155. * Return true iff the given suffix of s1 is less than the given suffix of s2.
  156. */
  157. template<typename T1, typename T2>
  158. static inline bool sstr_suf_lt(
  159. const T1& s1, size_t suf1,
  160. const T2& s2, size_t suf2,
  161. bool endlt = true)
  162. {
  163. assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
  164. assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
  165. size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
  166. size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
  167. size_t minlen = (len1 < len2 ? len1 : len2);
  168. for(size_t i = 0; i < minlen; i++) {
  169. if(s1[suf1+i] < s2[suf2+i]) {
  170. return true;
  171. } else if(s1[suf1+i] > s2[suf2+i]) {
  172. return false;
  173. }
  174. }
  175. if(len1 == len2) return false;
  176. return (len1 < len2) == endlt;
  177. }
  178. /**
  179. * Return true iff the given suffix of s1 is less than the given suffix of s2.
  180. * Treat s1 and s2 as though they have lengths len1/len2.
  181. */
  182. template<typename T1, typename T2>
  183. static inline bool sstr_suf_lt(
  184. const T1& s1, size_t suf1, size_t len1,
  185. const T2& s2, size_t suf2, size_t len2,
  186. bool endlt = true)
  187. {
  188. assert_leq(suf1, len1);
  189. assert_leq(suf2, len2);
  190. size_t left1 = len1 - suf1;
  191. size_t left2 = len2 - suf2;
  192. size_t minleft = (left1 < left2 ? left1 : left2);
  193. for(size_t i = 0; i < minleft; i++) {
  194. if(s1[suf1+i] < s2[suf2+i]) {
  195. return true;
  196. } else if(s1[suf1+i] > s2[suf2+i]) {
  197. return false;
  198. }
  199. }
  200. if(left1 == left2) return false;
  201. return (left1 < left2) == endlt;
  202. }
  203. /**
  204. * Return true iff the given suffix of s1 is less than the given suffix of s2
  205. * up to upto characters.
  206. */
  207. template<typename T1, typename T2>
  208. static inline bool sstr_suf_upto_lt(
  209. const T1& s1, size_t suf1,
  210. const T2& s2, size_t suf2,
  211. size_t upto,
  212. bool endlt = true)
  213. {
  214. assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
  215. assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
  216. size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
  217. size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
  218. if(len1 > upto) len1 = upto;
  219. if(len2 > upto) len2 = upto;
  220. size_t minlen = (len1 < len2 ? len1 : len2);
  221. for(size_t i = 0; i < minlen; i++) {
  222. if(s1[suf1+i] < s2[suf2+i]) {
  223. return true;
  224. } else if(s1[suf1+i] > s2[suf2+i]) {
  225. return false;
  226. }
  227. }
  228. if(len1 == len2) return false;
  229. return (len1 < len2) == endlt;
  230. }
  231. /**
  232. * Return true iff the given prefix of s1 is less than the given prefix of s2.
  233. */
  234. template<typename T1, typename T2>
  235. static inline bool sstr_pre_lt(
  236. const T1& s1, size_t pre1,
  237. const T2& s2, size_t pre2,
  238. bool endlt = true)
  239. {
  240. assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
  241. assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
  242. size_t len1 = pre1;
  243. size_t len2 = pre2;
  244. size_t minlen = (len1 < len2 ? len1 : len2);
  245. for(size_t i = 0; i < minlen; i++) {
  246. if(s1[i] < s2[i]) {
  247. return true;
  248. } else if(s1[i] > s2[i]) {
  249. return false;
  250. }
  251. }
  252. if(len1 == len2) return false;
  253. return (len1 < len2) == endlt;
  254. }
  255. /**
  256. * Return true iff s1 is less than or equal to s2.
  257. */
  258. template<typename T1, typename T2>
  259. static inline bool sstr_leq(const T1& s1, const T2& s2, bool endlt = true) {
  260. size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
  261. size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
  262. size_t minlen = (len1 < len2 ? len1 : len2);
  263. for(size_t i = 0; i < minlen; i++) {
  264. if(s1[i] < s2[i]) {
  265. return true;
  266. } else if(s1[i] > s2[i]) {
  267. return false;
  268. }
  269. }
  270. if(len1 == len2) return true;
  271. return (len1 < len2) == endlt;
  272. }
  273. /**
  274. * Return true iff the given suffix of s1 is less than or equal to the given
  275. * suffix of s2.
  276. */
  277. template<typename T1, typename T2>
  278. static inline bool sstr_suf_leq(
  279. const T1& s1, size_t suf1,
  280. const T2& s2, size_t suf2,
  281. bool endlt = true)
  282. {
  283. assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
  284. assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
  285. size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
  286. size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
  287. size_t minlen = (len1 < len2 ? len1 : len2);
  288. for(size_t i = 0; i < minlen; i++) {
  289. if(s1[suf1+i] < s2[suf2+i]) {
  290. return true;
  291. } else if(s1[suf1+i] > s2[suf2+i]) {
  292. return false;
  293. }
  294. }
  295. if(len1 == len2) return true;
  296. return (len1 < len2) == endlt;
  297. }
  298. /**
  299. * Return true iff the given prefix of s1 is less than or equal to the given
  300. * prefix of s2.
  301. */
  302. template<typename T1, typename T2>
  303. static inline bool sstr_pre_leq(
  304. const T1& s1, size_t pre1,
  305. const T2& s2, size_t pre2,
  306. bool endlt = true)
  307. {
  308. assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
  309. assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
  310. size_t len1 = pre1;
  311. size_t len2 = pre2;
  312. size_t minlen = (len1 < len2 ? len1 : len2);
  313. for(size_t i = 0; i < minlen; i++) {
  314. if(s1[i] < s2[i]) {
  315. return true;
  316. } else if(s1[i] > s2[i]) {
  317. return false;
  318. }
  319. }
  320. if(len1 == len2) return true;
  321. return (len1 < len2) == endlt;
  322. }
  323. /**
  324. * Return true iff s1 is greater than s2.
  325. */
  326. template<typename T1, typename T2>
  327. static inline bool sstr_gt(const T1& s1, const T2& s2, bool endlt = true) {
  328. size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
  329. size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
  330. size_t minlen = (len1 < len2 ? len1 : len2);
  331. for(size_t i = 0; i < minlen; i++) {
  332. if(s1[i] > s2[i]) {
  333. return true;
  334. } else if(s1[i] < s2[i]) {
  335. return false;
  336. }
  337. }
  338. if(len1 == len2) return false;
  339. return (len1 > len2) == endlt;
  340. }
  341. /**
  342. * Return true iff the given suffix of s1 is greater than the given suffix of
  343. * s2.
  344. */
  345. template<typename T1, typename T2>
  346. static inline bool sstr_suf_gt(
  347. const T1& s1, size_t suf1,
  348. const T2& s2, size_t suf2,
  349. bool endlt = true)
  350. {
  351. assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
  352. assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
  353. size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
  354. size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
  355. size_t minlen = (len1 < len2 ? len1 : len2);
  356. for(size_t i = 0; i < minlen; i++) {
  357. if(s1[suf1+i] > s2[suf2+i]) {
  358. return true;
  359. } else if(s1[suf1+i] < s2[suf2+i]) {
  360. return false;
  361. }
  362. }
  363. if(len1 == len2) return false;
  364. return (len1 > len2) == endlt;
  365. }
  366. /**
  367. * Return true iff the given prefix of s1 is greater than the given prefix of
  368. * s2.
  369. */
  370. template<typename T1, typename T2>
  371. static inline bool sstr_pre_gt(
  372. const T1& s1, size_t pre1,
  373. const T2& s2, size_t pre2,
  374. bool endlt = true)
  375. {
  376. assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
  377. assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
  378. size_t len1 = pre1;
  379. size_t len2 = pre2;
  380. size_t minlen = (len1 < len2 ? len1 : len2);
  381. for(size_t i = 0; i < minlen; i++) {
  382. if(s1[i] > s2[i]) {
  383. return true;
  384. } else if(s1[i] < s2[i]) {
  385. return false;
  386. }
  387. }
  388. if(len1 == len2) return false;
  389. return (len1 > len2) == endlt;
  390. }
  391. /**
  392. * Return true iff s1 is greater than or equal to s2.
  393. */
  394. template<typename T1, typename T2>
  395. static inline bool sstr_geq(const T1& s1, const T2& s2, bool endlt = true) {
  396. size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
  397. size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
  398. size_t minlen = (len1 < len2 ? len1 : len2);
  399. for(size_t i = 0; i < minlen; i++) {
  400. if(s1[i] > s2[i]) {
  401. return true;
  402. } else if(s1[i] < s2[i]) {
  403. return false;
  404. }
  405. }
  406. if(len1 == len2) return true;
  407. return (len1 > len2) == endlt;
  408. }
  409. /**
  410. * Return true iff the given suffix of s1 is greater than or equal to the given
  411. * suffix of s2.
  412. */
  413. template<typename T1, typename T2>
  414. static inline bool sstr_suf_geq(
  415. const T1& s1, size_t suf1,
  416. const T2& s2, size_t suf2,
  417. bool endlt = true)
  418. {
  419. assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
  420. assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
  421. size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
  422. size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
  423. size_t minlen = (len1 < len2 ? len1 : len2);
  424. for(size_t i = 0; i < minlen; i++) {
  425. if(s1[suf1+i] > s2[suf2+i]) {
  426. return true;
  427. } else if(s1[suf1+i] < s2[suf2+i]) {
  428. return false;
  429. }
  430. }
  431. if(len1 == len2) return true;
  432. return (len1 > len2) == endlt;
  433. }
  434. /**
  435. * Return true iff the given prefix of s1 is greater than or equal to the given
  436. * prefix of s2.
  437. */
  438. template<typename T1, typename T2>
  439. static inline bool sstr_pre_geq(
  440. const T1& s1, size_t pre1,
  441. const T2& s2, size_t pre2,
  442. bool endlt = true)
  443. {
  444. assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
  445. assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
  446. size_t len1 = pre1;
  447. size_t len2 = pre2;
  448. size_t minlen = (len1 < len2 ? len1 : len2);
  449. for(size_t i = 0; i < minlen; i++) {
  450. if(s1[i] > s2[i]) {
  451. return true;
  452. } else if(s1[i] < s2[i]) {
  453. return false;
  454. }
  455. }
  456. if(len1 == len2) return true;
  457. return (len1 > len2) == endlt;
  458. }
  /**
   * Return a C-string view of s by calling its toZBuf() member.
   */
  template<typename T>
  static inline const char * sstr_to_cstr(const T& s) {
      const char *zbuf = s.toZBuf();
      return zbuf;
  }

  /**
   * Specialization for std::string, which has no toZBuf(); c_str() is used
   * instead.
   */
  template<>
  inline const char * sstr_to_cstr<std::string>(const std::string& s)
  {
      const char *zbuf = s.c_str();
      return zbuf;
  }
  469. /**
  470. * Simple string class with backing memory whose size is managed by the user
  471. * using the constructor and install() member function. No behind-the-scenes
  472. * reallocation or copying takes place.
  473. */
  474. template<typename T>
  475. class SString {
  476. public:
  477. explicit SString() :
  478. cs_(NULL),
  479. printcs_(NULL),
  480. len_(0)
  481. { }
  482. explicit SString(size_t sz) :
  483. cs_(NULL),
  484. printcs_(NULL),
  485. len_(0)
  486. {
  487. resize(sz);
  488. }
  489. /**
  490. * Create an SStringExpandable from another SStringExpandable.
  491. */
  492. SString(const SString<T>& o) :
  493. cs_(NULL),
  494. printcs_(NULL),
  495. len_(0)
  496. {
  497. *this = o;
  498. }
  499. /**
  500. * Create an SStringExpandable from a std::basic_string of the
  501. * appropriate type.
  502. */
  503. explicit SString(const std::basic_string<T>& str) :
  504. cs_(NULL),
  505. printcs_(NULL),
  506. len_(0)
  507. {
  508. install(str.c_str(), str.length());
  509. }
  510. /**
  511. * Create an SStringExpandable from an array and size.
  512. */
  513. explicit SString(const T* b, size_t sz) :
  514. cs_(NULL),
  515. printcs_(NULL),
  516. len_(0)
  517. {
  518. install(b, sz);
  519. }
  520. /**
  521. * Create an SStringExpandable from a zero-terminated array.
  522. */
  523. explicit SString(const T* b) :
  524. cs_(NULL),
  525. printcs_(NULL),
  526. len_(0)
  527. {
  528. install(b, strlen(b));
  529. }
  530. /**
  531. * Destroy the expandable string object.
  532. */
  533. virtual ~SString() {
  534. if(cs_ != NULL) {
  535. delete[] cs_;
  536. cs_ = NULL;
  537. }
  538. if(printcs_ != NULL) {
  539. delete[] printcs_;
  540. printcs_ = NULL;
  541. }
  542. len_ = 0;
  543. }
  544. /**
  545. * Assignment to other SString.
  546. */
  547. SString<T>& operator=(const SString<T>& o) {
  548. install(o.cs_, o.len_);
  549. return *this;
  550. }
  551. /**
  552. * Assignment to other SString.
  553. */
  554. SString<T>& operator=(const std::basic_string<T>& o) {
  555. install(o);
  556. return *this;
  557. }
  558. /**
  559. * Resizes the string without preserving its contents.
  560. */
  561. void resize(size_t sz) {
  562. if(cs_ != NULL) {
  563. delete cs_;
  564. cs_ = NULL;
  565. }
  566. if(printcs_ != NULL) {
  567. delete printcs_;
  568. printcs_ = NULL;
  569. }
  570. if(sz != 0) {
  571. cs_ = new T[sz+1];
  572. }
  573. len_ = sz;
  574. }
  575. /**
  576. * Return ith character from the left of either the forward or the
  577. * reverse version of the read.
  578. */
  579. T windowGet(
  580. size_t i,
  581. bool fw,
  582. size_t depth = 0,
  583. size_t len = 0) const
  584. {
  585. if(len == 0) len = len_;
  586. assert_lt(i, len);
  587. assert_leq(len, len_ - depth);
  588. return fw ? cs_[depth+i] : cs_[depth+len-i-1];
  589. }
  590. /**
  591. * Return ith character from the left of either the forward or the
  592. * reverse-complement version of the read.
  593. */
  594. void windowGet(
  595. T& ret,
  596. bool fw,
  597. size_t depth = 0,
  598. size_t len = 0) const
  599. {
  600. if(len == 0) len = len_;
  601. assert_leq(len, len_ - depth);
  602. ret.resize(len);
  603. for(size_t i = 0; i < len; i++) {
  604. ret.set(fw ? cs_[depth+i] : cs_[depth+len-i-1], i);
  605. }
  606. }
  607. /**
  608. * Set character at index 'idx' to 'c'.
  609. */
  610. inline void set(int c, size_t idx) {
  611. assert_lt(idx, len_);
  612. cs_[idx] = c;
  613. }
  614. /**
  615. * Retrieve constant version of element i.
  616. */
  617. inline const T& operator[](size_t i) const {
  618. assert_lt(i, len_);
  619. return cs_[i];
  620. }
  621. /**
  622. * Retrieve mutable version of element i.
  623. */
  624. inline T& operator[](size_t i) {
  625. assert_lt(i, len_);
  626. return cs_[i];
  627. }
  628. /**
  629. * Retrieve constant version of element i.
  630. */
  631. inline const T& get(size_t i) const {
  632. assert_lt(i, len_);
  633. return cs_[i];
  634. }
  635. /**
  636. * Copy 'sz' bytes from buffer 'b' into this string. memcpy is used, not
  637. * operator=.
  638. */
  639. virtual void install(const T* b, size_t sz) {
  640. if(sz == 0) return;
  641. resize(sz);
  642. memcpy(cs_, b, sz * sizeof(T));
  643. }
  644. /**
  645. * Copy 'sz' bytes from buffer 'b' into this string. memcpy is used, not
  646. * operator=.
  647. */
  648. virtual void install(const std::basic_string<T>& b) {
  649. size_t sz = b.length();
  650. if(sz == 0) return;
  651. resize(sz);
  652. memcpy(cs_, b.c_str(), sz * sizeof(T));
  653. }
  654. /**
  655. * Copy all bytes from zero-terminated buffer 'b' into this string.
  656. */
  657. void install(const T* b) {
  658. install(b, strlen(b));
  659. }
  660. /**
  661. * Copy 'sz' bytes from buffer 'b' into this string, reversing them
  662. * in the process.
  663. */
  664. void installReverse(const char* b, size_t sz) {
  665. if(sz == 0) return;
  666. resize(sz);
  667. for(size_t i = 0; i < sz; i++) {
  668. cs_[i] = b[sz-i-1];
  669. }
  670. len_ = sz;
  671. }
  672. /**
  673. * Copy 'sz' bytes from buffer 'b' into this string, reversing them
  674. * in the process.
  675. */
  676. void installReverse(const SString<T>& b) {
  677. installReverse(b.cs_, b.len_);
  678. }
  679. /**
  680. * Return true iff the two strings are equal.
  681. */
  682. bool operator==(const SString<T>& o) {
  683. return sstr_eq(*this, o);
  684. }
  685. /**
  686. * Return true iff the two strings are not equal.
  687. */
  688. bool operator!=(const SString<T>& o) {
  689. return sstr_neq(*this, o);
  690. }
  691. /**
  692. * Return true iff this string is less than given string.
  693. */
  694. bool operator<(const SString<T>& o) {
  695. return sstr_lt(*this, o);
  696. }
  697. /**
  698. * Return true iff this string is greater than given string.
  699. */
  700. bool operator>(const SString<T>& o) {
  701. return sstr_gt(*this, o);
  702. }
  703. /**
  704. * Return true iff this string is less than or equal to given string.
  705. */
  706. bool operator<=(const SString<T>& o) {
  707. return sstr_leq(*this, o);
  708. }
  709. /**
  710. * Return true iff this string is greater than or equal to given string.
  711. */
  712. bool operator>=(const SString<T>& o) {
  713. return sstr_geq(*this, o);
  714. }
  715. /**
  716. * Reverse the buffer in place.
  717. */
  718. void reverse() {
  719. for(size_t i = 0; i < (len_ >> 1); i++) {
  720. T tmp = get(i);
  721. set(get(len_-i-1), i);
  722. set(tmp, len_-i-1);
  723. }
  724. }
  725. /**
  726. * Reverse a substring of the buffer in place.
  727. */
  728. void reverseWindow(size_t off, size_t len) {
  729. assert_leq(off, len_);
  730. assert_leq(off + len, len_);
  731. size_t mid = len >> 1;
  732. for(size_t i = 0; i < mid; i++) {
  733. T tmp = get(off+i);
  734. set(get(off+len-i-1), off+i);
  735. set(tmp, off+len-i-1);
  736. }
  737. }
  738. /**
  739. * Set the first len elements of the buffer to el.
  740. */
  741. void fill(size_t len, const T& el) {
  742. assert_leq(len, len_);
  743. for(size_t i = 0; i < len; i++) {
  744. set(el, i);
  745. }
  746. }
  747. /**
  748. * Set all elements of the buffer to el.
  749. */
  750. void fill(const T& el) {
  751. fill(len_, el);
  752. }
  753. /**
  754. * Return the length of the string.
  755. */
  756. inline size_t length() const { return len_; }
  757. /**
  758. * Clear the buffer.
  759. */
  760. void clear() { len_ = 0; }
  761. /**
  762. * Return true iff the buffer is empty.
  763. */
  764. inline bool empty() const { return len_ == 0; }
  765. /**
  766. * Put a terminator in the 'len_'th element and then return a
  767. * pointer to the buffer. Useful for printing.
  768. */
  769. const char* toZBufXForm(const char *xform) const {
  770. ASSERT_ONLY(size_t xformElts = strlen(xform));
  771. // Lazily allocate space for print buffer
  772. if(printcs_ == NULL) {
  773. const_cast<char*&>(printcs_) = new char[len_+1];
  774. }
  775. char* printcs = const_cast<char*>(printcs_);
  776. assert(printcs != NULL);
  777. for(size_t i = 0; i < len_; i++) {
  778. assert_lt(cs_[i], (int)xformElts);
  779. printcs[i] = xform[cs_[i]];
  780. }
  781. printcs[len_] = 0;
  782. return printcs_;
  783. }
  784. /**
  785. * Put a terminator in the 'len_'th element and then return a
  786. * pointer to the buffer. Useful for printing.
  787. */
  788. virtual const T* toZBuf() const {
  789. const_cast<T*>(cs_)[len_] = 0;
  790. return cs_;
  791. }
  792. /**
  793. * Return a const version of the raw buffer.
  794. */
  795. const T* buf() const { return cs_; }
  796. /**
  797. * Return a writeable version of the raw buffer.
  798. */
  799. T* wbuf() { return cs_; }
  800. protected:
  801. T *cs_; // +1 so that we have the option of dropping in a terminating "\0"
  802. char *printcs_; // +1 so that we have the option of dropping in a terminating "\0"
  803. size_t len_; // # elements
  804. };
  805. /**
  806. * Simple string class with backing memory whose size is managed by the user
  807. * using the constructor and install() member function. No behind-the-scenes
  808. * reallocation or copying takes place.
  809. */
  810. class S2bDnaString {
  811. public:
  812. explicit S2bDnaString() :
  813. cs_(NULL),
  814. printcs_(NULL),
  815. len_(0)
  816. { }
  817. explicit S2bDnaString(size_t sz) :
  818. cs_(NULL),
  819. printcs_(NULL),
  820. len_(0)
  821. {
  822. resize(sz);
  823. }
  824. /**
  825. * Copy another object of the same class.
  826. */
  827. S2bDnaString(const S2bDnaString& o) :
  828. cs_(NULL),
  829. printcs_(NULL),
  830. len_(0)
  831. {
  832. *this = o;
  833. }
  834. /**
  835. * Create an SStringExpandable from a std::basic_string of the
  836. * appropriate type.
  837. */
  838. explicit S2bDnaString(
  839. const std::basic_string<char>& str,
  840. bool chars = false,
  841. bool colors = false) :
  842. cs_(NULL),
  843. printcs_(NULL),
  844. len_(0)
  845. {
  846. if(chars) {
  847. if(colors) {
  848. installColors(str.c_str(), str.length());
  849. } else {
  850. installChars(str.c_str(), str.length());
  851. }
  852. } else {
  853. install(str.c_str(), str.length());
  854. }
  855. }
  856. /**
  857. * Create an SStringExpandable from an array and size.
  858. */
  859. explicit S2bDnaString(
  860. const char* b,
  861. size_t sz,
  862. bool chars = false,
  863. bool colors = false) :
  864. cs_(NULL),
  865. printcs_(NULL),
  866. len_(0)
  867. {
  868. if(chars) {
  869. if(colors) {
  870. installColors(b, sz);
  871. } else {
  872. installChars(b, sz);
  873. }
  874. } else {
  875. install(b, sz);
  876. }
  877. }
  878. /**
  879. * Create an SStringFixed from a zero-terminated string.
  880. */
  881. explicit S2bDnaString(
  882. const char* b,
  883. bool chars = false,
  884. bool colors = false) :
  885. cs_(NULL),
  886. printcs_(NULL),
  887. len_(0)
  888. {
  889. if(chars) {
  890. if(colors) {
  891. installColors(b, strlen(b));
  892. } else {
  893. installChars(b, strlen(b));
  894. }
  895. } else {
  896. install(b, strlen(b));
  897. }
  898. }
  899. /**
  900. * Destroy the expandable string object.
  901. */
  902. virtual ~S2bDnaString() {
  903. if(cs_ != NULL) {
  904. delete[] cs_;
  905. cs_ = NULL;
  906. }
  907. if(printcs_ != NULL) {
  908. delete[] printcs_;
  909. printcs_ = NULL;
  910. }
  911. len_ = 0;
  912. }
  913. /**
  914. * Assignment to other SString.
  915. */
  916. template<typename T>
  917. S2bDnaString& operator=(const T& o) {
  918. install(o.c_str(), o.length());
  919. return *this;
  920. }
  921. /**
  922. * Assignment from a std::basic_string
  923. */
  924. template<typename T>
  925. S2bDnaString& operator=(const std::basic_string<char>& o) {
  926. install(o);
  927. return *this;
  928. }
  929. /**
  930. * Resizes the string without preserving its contents.
  931. */
  932. void resize(size_t sz) {
  933. if(cs_ != NULL) {
  934. delete cs_;
  935. cs_ = NULL;
  936. }
  937. if(printcs_ != NULL) {
  938. delete printcs_;
  939. printcs_ = NULL;
  940. }
  941. len_ = sz;
  942. if(sz != 0) {
  943. cs_ = new uint32_t[nwords()];
  944. }
  945. }
  946. /**
  947. * Return DNA character corresponding to element 'idx'.
  948. */
  949. char toChar(size_t idx) const {
  950. int c = (int)get(idx);
  951. assert_range(0, 3, c);
  952. return "ACGT"[c];
  953. }
  954. /**
  955. * Return color character corresponding to element 'idx'.
  956. */
  957. char toColor(size_t idx) const {
  958. int c = (int)get(idx);
  959. assert_range(0, 3, c);
  960. return "0123"[c];
  961. }
  962. /**
  963. * Return ith character from the left of either the forward or the
  964. * reverse version of the read.
  965. */
  966. char windowGet(
  967. size_t i,
  968. bool fw,
  969. size_t depth = 0,
  970. size_t len = 0) const
  971. {
  972. if(len == 0) len = len_;
  973. assert_lt(i, len);
  974. assert_leq(len, len_ - depth);
  975. return fw ? get(depth+i) : get(depth+len-i-1);
  976. }
  977. /**
  978. * Return ith character from the left of either the forward or the
  979. * reverse-complement version of the read.
  980. */
  981. template<typename T>
  982. void windowGet(
  983. T& ret,
  984. bool fw,
  985. size_t depth = 0,
  986. size_t len = 0) const
  987. {
  988. if(len == 0) len = len_;
  989. assert_leq(len, len_ - depth);
  990. ret.resize(len);
  991. for(size_t i = 0; i < len; i++) {
  992. ret.set((fw ? get(depth+i) : get(depth+len-i-1)), i);
  993. }
  994. }
  995. /**
  996. * Return length in 32-bit words.
  997. */
  998. size_t nwords() const {
  999. return (len_ + 15) >> 4;
  1000. }
  1001. /**
  1002. * Set character at index 'idx' to 'c'.
  1003. */
  1004. void set(int c, size_t idx) {
  1005. assert_lt(idx, len_);
  1006. assert_range(0, 3, c);
  1007. size_t word = idx >> 4;
  1008. size_t bpoff = (idx & 15) << 1;
  1009. cs_[word] = cs_[word] & ~(uint32_t)(3 << bpoff);
  1010. cs_[word] = cs_[word] | (uint32_t)(c << bpoff);
  1011. }
  1012. /**
  1013. * Set character at index 'idx' to DNA char 'c'.
  1014. */
  1015. void setChar(int c, size_t idx) {
  1016. assert_in(toupper(c), "ACGT");
  1017. int bp = asc2dna[c];
  1018. set(bp, idx);
  1019. }
  1020. /**
  1021. * Set character at index 'idx' to color char 'c'.
  1022. */
  1023. void setColor(int c, size_t idx) {
  1024. assert_in(toupper(c), "0123");
  1025. int co = asc2col[c];
  1026. set(co, idx);
  1027. }
  1028. /**
  1029. * Set the ith 32-bit word to given word.
  1030. */
  1031. void setWord(uint32_t w, size_t i) {
  1032. assert_lt(i, nwords());
  1033. cs_[i] = w;
  1034. }
  1035. /**
  1036. * Retrieve constant version of element i.
  1037. */
  1038. char operator[](size_t i) const {
  1039. assert_lt(i, len_);
  1040. return get(i);
  1041. }
  1042. /**
  1043. * Retrieve constant version of element i.
  1044. */
  1045. char get(size_t i) const {
  1046. assert_lt(i, len_);
  1047. size_t word = i >> 4;
  1048. size_t bpoff = (i & 15) << 1;
  1049. return (char)((cs_[word] >> bpoff) & 3);
  1050. }
  1051. /**
  1052. * Copy packed words from string 'b' into this packed string.
  1053. */
  1054. void install(const uint32_t* b, size_t sz) {
  1055. if(sz == 0) return;
  1056. resize(sz);
  1057. memcpy(cs_, b, sizeof(uint32_t)*nwords());
  1058. }
  1059. /**
  1060. * Copy 'sz' DNA characters encoded as integers from buffer 'b' into this
  1061. * packed string.
  1062. */
  1063. void install(const char* b, size_t sz) {
  1064. if(sz == 0) return;
  1065. resize(sz);
  1066. size_t wordi = 0;
  1067. for(size_t i = 0; i < sz; i += 16) {
  1068. uint32_t word = 0;
  1069. for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
  1070. uint32_t bp = (int)b[i+j];
  1071. uint32_t shift = (uint32_t)j << 1;
  1072. assert_range(0, 3, (int)bp);
  1073. word |= (bp << shift);
  1074. }
  1075. cs_[wordi++] = word;
  1076. }
  1077. }
  1078. /**
  1079. * Copy 'sz' DNA characters from buffer 'b' into this packed string.
  1080. */
  1081. void installChars(const char* b, size_t sz) {
  1082. if(sz == 0) return;
  1083. resize(sz);
  1084. size_t wordi = 0;
  1085. for(size_t i = 0; i < sz; i += 16) {
  1086. uint32_t word = 0;
  1087. for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
  1088. char c = b[i+j];
  1089. assert_in(toupper(c), "ACGT");
  1090. int bp = asc2dna[(int)c];
  1091. assert_range(0, 3, (int)bp);
  1092. uint32_t shift = (uint32_t)j << 1;
  1093. word |= (bp << shift);
  1094. }
  1095. cs_[wordi++] = word;
  1096. }
  1097. }
  1098. /**
  1099. * Copy 'sz' color characters from buffer 'b' into this packed string.
  1100. */
  1101. void installColors(const char* b, size_t sz) {
  1102. if(sz == 0) return;
  1103. resize(sz);
  1104. size_t wordi = 0;
  1105. for(size_t i = 0; i < sz; i += 16) {
  1106. uint32_t word = 0;
  1107. for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
  1108. char c = b[i+j];
  1109. assert_in(c, "0123");
  1110. int bp = asc2col[(int)c];
  1111. assert_range(0, 3, (int)bp);
  1112. uint32_t shift = (uint32_t)j << 1;
  1113. word |= (bp << shift);
  1114. }
  1115. cs_[wordi++] = word;
  1116. }
  1117. }
  1118. /**
  1119. * Copy integer-encoded DNA characters from zero-terminated buffer 'b' into this packed string.
  1120. */
  1121. void install(const char* b) {
  1122. install(b, strlen(b));
  1123. }
  1124. /**
  1125. * Copy DNA characters from zero-terminated buffer 'b' into this packed string.
  1126. */
  1127. void installChars(const char* b) {
  1128. installChars(b, strlen(b));
  1129. }
  1130. /**
  1131. * Copy color characters from zero-terminated buffer 'b' into this packed string.
  1132. */
  1133. void installColors(const char* b) {
  1134. installColors(b, strlen(b));
  1135. }
  1136. /**
  1137. * Copy all integer-encoded DNA characters from string 'b' into this packed string.
  1138. */
  1139. void install(const std::basic_string<char>& b) {
  1140. install(b.c_str(), b.length());
  1141. }
  1142. /**
  1143. * Copy all DNA characters from string 'b' into this packed string.
  1144. */
  1145. void installChars(const std::basic_string<char>& b) {
  1146. installChars(b.c_str(), b.length());
  1147. }
  1148. /**
  1149. * Copy all color characters from string 'b' into this packed string.
  1150. */
  1151. void installColors(const std::basic_string<char>& b) {
  1152. installColors(b.c_str(), b.length());
  1153. }
  1154. /**
  1155. * Copy 'sz' bytes from buffer 'b' into this string, reversing them
  1156. * in the process.
  1157. */
  1158. void installReverse(const char* b, size_t sz) {
  1159. resize(sz);
  1160. if(sz == 0) return;
  1161. size_t wordi = 0;
  1162. size_t bpi = 0;
  1163. cs_[0] = 0;
  1164. for(size_t i =sz; i > 0; i--) {
  1165. assert_range(0, 3, (int)b[i-1]);
  1166. cs_[wordi] |= ((int)b[i-1] << (bpi<<1));
  1167. if(bpi == 15) {
  1168. wordi++;
  1169. cs_[wordi] = 0;
  1170. bpi = 0;
  1171. } else bpi++;
  1172. }
  1173. }
  1174. /**
  1175. * Copy all chars from buffer of DNA characters 'b' into this string,
  1176. * reversing them in the process.
  1177. */
  1178. void installReverse(const char* b) {
  1179. installReverse(b, strlen(b));
  1180. }
  1181. /**
  1182. * Copy 'sz' bytes from buffer of DNA characters 'b' into this string,
  1183. * reversing them in the process.
  1184. */
  1185. void installReverseChars(const char* b, size_t sz) {
  1186. resize(sz);
  1187. if(sz == 0) return;
  1188. size_t wordi = 0;
  1189. size_t bpi = 0;
  1190. cs_[0] = 0;
  1191. for(size_t i =sz; i > 0; i--) {
  1192. char c = b[i-1];
  1193. assert_in(toupper(c), "ACGT");
  1194. int bp = asc2dna[(int)c];
  1195. assert_range(0, 3, bp);
  1196. cs_[wordi] |= (bp << (bpi<<1));
  1197. if(bpi == 15) {
  1198. wordi++;
  1199. cs_[wordi] = 0;
  1200. bpi = 0;
  1201. } else bpi++;
  1202. }
  1203. }
  1204. /**
  1205. * Copy all chars from buffer of DNA characters 'b' into this string,
  1206. * reversing them in the process.
  1207. */
  1208. void installReverseChars(const char* b) {
  1209. installReverseChars(b, strlen(b));
  1210. }
  1211. /**
  1212. * Copy 'sz' bytes from buffer of color characters 'b' into this string,
  1213. * reversing them in the process.
  1214. */
  1215. void installReverseColors(const char* b, size_t sz) {
  1216. resize(sz);
  1217. if(sz == 0) return;
  1218. size_t wordi = 0;
  1219. size_t bpi = 0;
  1220. cs_[0] = 0;
  1221. for(size_t i =sz; i > 0; i--) {
  1222. char c = b[i-1];
  1223. assert_in(c, "0123");
  1224. int bp = asc2col[(int)c];
  1225. assert_range(0, 3, bp);
  1226. cs_[wordi] |= (bp << (bpi<<1));
  1227. if(bpi == 15) {
  1228. wordi++;
  1229. cs_[wordi] = 0;
  1230. bpi = 0;
  1231. } else bpi++;
  1232. }
  1233. }
  1234. /**
  1235. * Copy all chars from buffer of color characters 'b' into this string,
  1236. * reversing them in the process.
  1237. */
  1238. void installReverseColors(const char* b) {
  1239. installReverseColors(b, strlen(b));
  1240. }
  1241. /**
  1242. * Copy all characters from packed string 'b' into this string, reversing
  1243. * them in the process.
  1244. */
  1245. void installReverse(const S2bDnaString& b) {
  1246. resize(b.len_);
  1247. if(b.len_ == 0) return;
  1248. size_t wordi = 0;
  1249. size_t bpi = 0;
  1250. size_t wordb = b.nwords()-1;
  1251. size_t bpb = (b.len_-1) & 15;
  1252. cs_[0] = 0;
  1253. for(size_t i = b.len_; i > 0; i--) {
  1254. int bbp = (int)((b[wordb] >> (bpb << 1)) & 3);
  1255. assert_range(0, 3, bbp);
  1256. cs_[wordi] |= (bbp << (bpi << 1));
  1257. if(bpi == 15) {
  1258. wordi++;
  1259. cs_[wordi] = 0;
  1260. bpi = 0;
  1261. } else bpi++;
  1262. if(bpb == 0) {
  1263. wordb--;
  1264. bpi = 15;
  1265. } else bpi--;
  1266. }
  1267. }
  1268. /**
  1269. * Return true iff the two strings are equal.
  1270. */
  1271. bool operator==(const S2bDnaString& o) {
  1272. return sstr_eq(*this, o);
  1273. }
  1274. /**
  1275. * Return true iff the two strings are not equal.
  1276. */
  1277. bool operator!=(const S2bDnaString& o) {
  1278. return sstr_neq(*this, o);
  1279. }
  1280. /**
  1281. * Return true iff this string is less than given string.
  1282. */
  1283. bool operator<(const S2bDnaString& o) {
  1284. return sstr_lt(*this, o);
  1285. }
  1286. /**
  1287. * Return true iff this string is greater than given string.
  1288. */
  1289. bool operator>(const S2bDnaString& o) {
  1290. return sstr_gt(*this, o);
  1291. }
  1292. /**
  1293. * Return true iff this string is less than or equal to given string.
  1294. */
  1295. bool operator<=(const S2bDnaString& o) {
  1296. return sstr_leq(*this, o);
  1297. }
  1298. /**
  1299. * Return true iff this string is greater than or equal to given string.
  1300. */
  1301. bool operator>=(const S2bDnaString& o) {
  1302. return sstr_geq(*this, o);
  1303. }
  1304. /**
  1305. * Reverse the 2-bit encoded DNA string in-place.
  1306. */
  1307. void reverse() {
  1308. if(len_ <= 1) return;
  1309. size_t wordf = nwords()-1;
  1310. size_t bpf = (len_-1) & 15;
  1311. size_t wordi = 0;
  1312. size_t bpi = 0;
  1313. while(wordf > wordi || (wordf == wordi && bpf > bpi)) {
  1314. int f = (cs_[wordf] >> (bpf << 1)) & 3;
  1315. int i = (cs_[wordi] >> (bpi << 1)) & 3;
  1316. cs_[wordf] &= ~(uint32_t)(3 << (bpf << 1));
  1317. cs_[wordi] &= ~(uint32_t)(3 << (bpi << 1));
  1318. cs_[wordf] |= (uint32_t)(i << (bpf << 1));
  1319. cs_[wordi] |= (uint32_t)(f << (bpi << 1));
  1320. if(bpf == 0) {
  1321. bpf = 15;
  1322. wordf--;
  1323. } else bpf--;
  1324. if(bpi == 15) {
  1325. bpi = 0;
  1326. wordi++;
  1327. } else bpi++;
  1328. }
  1329. }
  1330. /**
  1331. * Reverse a substring of the buffer in place.
  1332. */
  1333. void reverseWindow(size_t off, size_t len) {
  1334. assert_leq(off, len_);
  1335. assert_leq(off+len, len_);
  1336. if(len <= 1) return;
  1337. size_t wordf = (off+len-1) >> 4;
  1338. size_t bpf = (off+len-1) & 15;
  1339. size_t wordi = (off ) >> 4;
  1340. size_t bpi = (off ) & 15;
  1341. while(wordf > wordi || (wordf == wordi && bpf > bpi)) {
  1342. int f = (cs_[wordf] >> (bpf << 1)) & 3;
  1343. int i = (cs_[wordi] >> (bpi << 1)) & 3;
  1344. cs_[wordf] &= ~(uint32_t)(3 << (bpf << 1));
  1345. cs_[wordi] &= ~(uint32_t)(3 << (bpi << 1));
  1346. cs_[wordf] |= (uint32_t)(i << (bpf << 1));
  1347. cs_[wordi] |= (uint32_t)(f << (bpi << 1));
  1348. if(bpf == 0) {
  1349. bpf = 15;
  1350. wordf--;
  1351. } else bpf--;
  1352. if(bpi == 15) {
  1353. bpi = 0;
  1354. wordi++;
  1355. } else bpi++;
  1356. }
  1357. }
  1358. /**
  1359. * Set the first len elements of the buffer to el.
  1360. */
  1361. void fill(size_t len, char el) {
  1362. assert_leq(len, len_);
  1363. assert_range(0, 3, (int)el);
  1364. size_t word = 0;
  1365. if(len > 32) {
  1366. // Copy el throughout block
  1367. uint32_t bl = (uint32_t)el;
  1368. bl |= (bl << 2);
  1369. bl |= (bl << 4);
  1370. bl |= (bl << 8);
  1371. bl |= (bl << 16);
  1372. // Fill with blocks
  1373. size_t blen = len >> 4;
  1374. for(; word < blen; word++) {
  1375. cs_[word] = bl;
  1376. }
  1377. len = len & 15;
  1378. }
  1379. size_t bp = 0;
  1380. for(size_t i = 0; i < len; i++) {
  1381. cs_[word] &= ~(uint32_t)(3 << (bp << 1));
  1382. cs_[word] |= (uint32_t)(el << (bp << 1));
  1383. if(bp == 15) {
  1384. word++;
  1385. bp = 0;
  1386. } else bp++;
  1387. }
  1388. }
  1389. /**
  1390. * Set all elements of the buffer to el.
  1391. */
  1392. void fill(char el) {
  1393. fill(len_, el);
  1394. }
  1395. /**
  1396. * Return the ith character in the window defined by fw, color, depth and
  1397. * len.
  1398. */
  1399. char windowGetDna(
  1400. size_t i,
  1401. bool fw,
  1402. bool color,
  1403. size_t depth = 0,
  1404. size_t len = 0) const
  1405. {
  1406. if(len == 0) len = len_;
  1407. assert_lt(i, len);
  1408. assert_leq(len, len_ - depth);
  1409. if(fw) {
  1410. return get(depth+i);
  1411. } else {
  1412. return
  1413. color ?
  1414. get(depth+len-i-1) :
  1415. compDna(get(depth+len-i-1));
  1416. }
  1417. }
  1418. /**
  1419. * Fill the given DNA buffer with the substring specified by fw,
  1420. * color, depth and len.
  1421. */
  1422. template<typename T>
  1423. void windowGetDna(
  1424. T& buf,
  1425. bool fw,
  1426. bool color,
  1427. size_t depth = 0,
  1428. size_t len = 0) const
  1429. {
  1430. if(len == 0) len = len_;
  1431. assert_leq(len, len_ - depth);
  1432. buf.resize(len);
  1433. for(size_t i = 0; i < len; i++) {
  1434. buf.set(
  1435. (fw ?
  1436. get(depth+i) :
  1437. (color ?
  1438. get(depth+len-i-1) :
  1439. compDna(get(depth+len-i-1)))), i);
  1440. }
  1441. }
  1442. /**
  1443. * Return the length of the string.
  1444. */
  1445. inline size_t length() const { return len_; }
  1446. /**
  1447. * Clear the buffer.
  1448. */
  1449. void clear() { len_ = 0; }
  1450. /**
  1451. * Return true iff the buffer is empty.
  1452. */
  1453. inline bool empty() const { return len_ == 0; }
  1454. /**
  1455. * Return a const version of the raw buffer.
  1456. */
  1457. const uint32_t* buf() const { return cs_; }
  1458. /**
  1459. * Return a writeable version of the raw buffer.
  1460. */
  1461. uint32_t* wbuf() { return cs_; }
  1462. /**
  1463. * Note: the size of the string once it's stored in the print buffer is 4
  1464. * times as large as the string as stored in compact 2-bit-per-char words.
  1465. */
  1466. const char* toZBuf() const {
  1467. if(printcs_ == NULL) {
  1468. const_cast<char*&>(printcs_) = new char[len_+1];
  1469. }
  1470. char *printcs = const_cast<char*>(printcs_);
  1471. size_t word = 0, bp = 0;
  1472. for(size_t i = 0; i < len_; i++) {
  1473. int c = (cs_[word] >> (bp << 1)) & 3;
  1474. printcs[i] = "ACGT"[c];
  1475. if(bp == 15) {
  1476. word++;
  1477. bp = 0;
  1478. } else bp++;
  1479. }
  1480. printcs[len_] = '\0';
  1481. return printcs_;
  1482. }
  1483. protected:
  1484. uint32_t *cs_; // 2-bit packed words; element i is in word i>>4 at bit offset (i&15)*2
  1485. char *printcs_; // ASCII scratch buffer, allocated on first use by toZBuf()
  1486. size_t len_; // # elements
  1487. };
  1488. /**
  1489. * Simple string class with backing memory that automatically expands as needed.
  1490. */
  1491. template<typename T, int S = 1024, int M = 2>
  1492. class SStringExpandable {
  1493. public:
  1494. explicit SStringExpandable() :
  1495. cs_(NULL),
  1496. printcs_(NULL),
  1497. len_(0),
  1498. sz_(0)
  1499. { }
  1500. explicit SStringExpandable(size_t sz) :
  1501. cs_(NULL),
  1502. printcs_(NULL),
  1503. len_(0),
  1504. sz_(0)
  1505. {
  1506. expandNoCopy(sz);
  1507. }
  1508. /**
  1509. * Create an SStringExpandable from another SStringExpandable.
  1510. */
  1511. SStringExpandable(const SStringExpandable<T, S>& o) :
  1512. cs_(NULL),
  1513. printcs_(NULL),
  1514. len_(0),
  1515. sz_(0)
  1516. {
  1517. *this = o;
  1518. }
  1519. /**
  1520. * Create an SStringExpandable from a std::basic_string of the
  1521. * appropriate type.
  1522. */
  1523. explicit SStringExpandable(const std::basic_string<T>& str) :
  1524. cs_(NULL),
  1525. printcs_(NULL),
  1526. len_(0),
  1527. sz_(0)
  1528. {
  1529. install(str.c_str(), str.length());
  1530. }
  1531. /**
  1532. * Create an SStringExpandable from an array and size.
  1533. */
  1534. explicit SStringExpandable(const T* b, size_t sz) :
  1535. cs_(NULL),
  1536. printcs_(NULL),
  1537. len_(0),
  1538. sz_(0)
  1539. {
  1540. install(b, sz);
  1541. }
  1542. /**
  1543. * Create an SStringExpandable from a zero-terminated array.
  1544. */
  1545. explicit SStringExpandable(const T* b) :
  1546. cs_(NULL),
  1547. printcs_(NULL),
  1548. len_(0),
  1549. sz_(0)
  1550. {
  1551. install(b, strlen(b));
  1552. }
  1553. /**
  1554. * Destroy the expandable string object.
  1555. */
  1556. virtual ~SStringExpandable() {
  1557. if(cs_ != NULL) {
  1558. delete[] cs_;
  1559. cs_ = NULL;
  1560. }
  1561. if(printcs_ != NULL) {
  1562. delete[] printcs_;
  1563. printcs_ = NULL;
  1564. }
  1565. sz_ = len_ = 0;
  1566. }
  1567. /**
  1568. * Return ith character from the left of either the forward or the
  1569. * reverse-complement version of the read.
  1570. */
  1571. T windowGet(
  1572. size_t i,
  1573. bool fw,
  1574. size_t depth = 0,
  1575. size_t len = 0) const
  1576. {
  1577. if(len == 0) len = len_;
  1578. assert_lt(i, len);
  1579. assert_leq(len, len_ - depth);
  1580. return fw ? cs_[depth+i] : cs_[depth+len-i-1];
  1581. }
  1582. /**
  1583. * Return ith character from the left of either the forward or the
  1584. * reverse-complement version of the read.
  1585. */
  1586. void windowGet(
  1587. T& ret,
  1588. bool fw,
  1589. size_t depth = 0,
  1590. size_t len = 0) const
  1591. {
  1592. if(len == 0) len = len_;
  1593. assert_leq(len, len_ - depth);
  1594. for(size_t i = 0; i < len; i++) {
  1595. ret.append(fw ? cs_[depth+i] : cs_[depth+len-i-1]);
  1596. }
  1597. }
  1598. /**
  1599. * Assignment to other SStringFixed.
  1600. */
  1601. SStringExpandable<T,S>& operator=(const SStringExpandable<T,S>& o) {
  1602. install(o.cs_, o.len_);
  1603. return *this;
  1604. }
  1605. /**
  1606. * Assignment from a std::basic_string
  1607. */
  1608. SStringExpandable<T,S>& operator=(const std::basic_string<T>& o) {
  1609. install(o.c_str(), o.length());
  1610. return *this;
  1611. }
  1612. /**
  1613. * Insert char c before position 'idx'; slide subsequent chars down.
  1614. */
  1615. void insert(const T& c, size_t idx) {
  1616. assert_lt(idx, len_);
  1617. if(sz_ < len_ + 1) expandCopy((len_ + 1 + S) * M);
  1618. len_++;
  1619. // Move everyone down by 1
  1620. // len_ is the *new* length
  1621. for(size_t i = len_; i > idx+1; i--) {
  1622. cs_[i-1] = cs_[i-2];
  1623. }
  1624. cs_[idx] = c;
  1625. }
  1626. /**
  1627. * Set character at index 'idx' to 'c'.
  1628. */
  1629. void set(int c, size_t idx) {
  1630. assert_lt(idx, len_);
  1631. cs_[idx] = c;
  1632. }
  1633. /**
  1634. * Append char c.
  1635. */
  1636. void append(const T& c) {
  1637. if(sz_ < len_ + 1) expandCopy((len_ + 1 + S) * M);
  1638. cs_[len_++] = c;
  1639. }
  1640. /**
  1641. * Delete char at position 'idx'; slide subsequent chars up.
  1642. */
  1643. void remove(size_t idx) {
  1644. assert_lt(idx, len_);
  1645. assert_gt(len_, 0);
  1646. for(size_t i = idx; i < len_-1; i++) {
  1647. cs_[i] = cs_[i+1];
  1648. }
  1649. len_--;
  1650. }
  1651. /**
  1652. * Retrieve constant version of element i.
  1653. */
  1654. const T& operator[](size_t i) const {
  1655. assert_lt(i, len_);
  1656. return cs_[i];
  1657. }
  1658. /**
  1659. * Retrieve mutable version of element i.
  1660. */
  1661. T& operator[](size_t i) {
  1662. assert_lt(i, len_);
  1663. return cs_[i];
  1664. }
  1665. /**
  1666. * Retrieve constant version of element i.
  1667. */
  1668. const T& get(size_t i) const {
  1669. assert_lt(i, len_);
  1670. return cs_[i];
  1671. }
  1672. /**
  1673. * Copy 'sz' bytes from buffer 'b' into this string.
  1674. */
  1675. virtual void install(const T* b, size_t sz) {
  1676. if(sz_ < sz) expandNoCopy((sz + S) * M);
  1677. memcpy(cs_, b, sz * sizeof(T));
  1678. len_ = sz;
  1679. }
  1680. /**
  1681. * Copy all bytes from zero-terminated buffer 'b' into this string.
  1682. */
  1683. void install(const T* b) { install(b, strlen(b)); }
  1684. /**
  1685. * Copy 'sz' bytes from buffer 'b' into this string, reversing them
  1686. * in the process.
  1687. */
  1688. void installReverse(const char* b, size_t sz) {
  1689. if(sz_ < sz) expandNoCopy((sz + S) * M);
  1690. for(size_t i = 0; i < sz; i++) {
  1691. cs_[i] = b[sz-i-1];
  1692. }
  1693. len_ = sz;
  1694. }
  1695. /**
  1696. * Copy 'sz' bytes from buffer 'b' into this string, reversing them
  1697. * in the process.
  1698. */
  1699. void installReverse(const SStringExpandable<T, S>& b) {
  1700. if(sz_ < b.len_) expandNoCopy((b.len_ + S) * M);
  1701. for(size_t i = 0; i < b.len_; i++) {
  1702. cs_[i] = b.cs_[b.len_ - i - 1];
  1703. }
  1704. len_ = b.len_;
  1705. }
  1706. /**
  1707. * Return true iff the two strings are equal.
  1708. */
  1709. bool operator==(const SStringExpandable<T, S>& o) {
  1710. return sstr_eq(*this, o);
  1711. }
  1712. /**
  1713. * Return true iff the two strings are not equal.
  1714. */
  1715. bool operator!=(const SStringExpandable<T, S>& o) {
  1716. return sstr_neq(*this, o);
  1717. }
  1718. /**
  1719. * Return true iff this string is less than given string.
  1720. */
  1721. bool operator<(const SStringExpandable<T, S>& o) {
  1722. return sstr_lt(*this, o);
  1723. }
  1724. /**
  1725. * Return true iff this string is greater than given string.
  1726. */
  1727. bool operator>(const SStringExpandable<T, S>& o) {
  1728. return sstr_gt(*this, o);
  1729. }
  1730. /**
  1731. * Return true iff this string is less than or equal to given string.
  1732. */
  1733. bool operator<=(const SStringExpandable<T, S>& o) {
  1734. return sstr_leq(*this, o);
  1735. }
  1736. /**
  1737. * Return true iff this string is greater than or equal to given string.
  1738. */
  1739. bool operator>=(const SStringExpandable<T, S>& o) {
  1740. return sstr_geq(*this, o);
  1741. }
  1742. /**
  1743. * Reverse the buffer in place.
  1744. */
  1745. void reverse() {
  1746. for(size_t i = 0; i < (len_ >> 1); i++) {
  1747. T tmp = get(i);
  1748. set(get(len_-i-1), i);
  1749. set(tmp, len_-i-1);
  1750. }
  1751. }
  1752. /**
  1753. * Reverse a substring of the buffer in place.
  1754. */
  1755. void reverseWindow(size_t off, size_t len) {
  1756. assert_leq(off, len_);
  1757. assert_leq(off + len, len_);
  1758. size_t mid = len >> 1;
  1759. for(size_t i = 0; i < mid; i++) {
  1760. T tmp = get(off+i);
  1761. set(get(off+len-i-1), off+i);
  1762. set(tmp, off+len-i-1);
  1763. }
  1764. }
  1765. /**
  1766. * Simply resize the buffer. If the buffer is resized to be
  1767. * longer, the newly-added elements will contain garbage and should
  1768. * be initialized immediately.
  1769. */
  1770. void resize(size_t len) {
  1771. if(sz_ < len) expandCopy((len + S) * M);
  1772. len_ = len;
  1773. }
  1774. /**
  1775. * Simply resize the buffer. If the buffer is resized to be
  1776. * longer, new elements will be initialized with 'el'.
  1777. */
  1778. void resize(size_t len, const T& el) {
  1779. if(sz_ < len) expandCopy((len + S) * M);
  1780. if(len > len_) {
  1781. for(size_t i = len_; i < len; i++) {
  1782. cs_[i] = el;
  1783. }
  1784. }
  1785. len_ = len;
  1786. }
  1787. /**
  1788. * Set the first len elements of the buffer to el.
  1789. */
  1790. void fill(size_t len, const T& el) {
  1791. assert_leq(len, len_);
  1792. for(size_t i = 0; i < len; i++) {
  1793. cs_[i] = el;
  1794. }
  1795. }
  1796. /**
  1797. * Set all elements of the buffer to el.
  1798. */
  1799. void fill(const T& el) {
  1800. fill(len_, el);
  1801. }
  1802. /**
  1803. * Trim len characters from the beginning of the string.
  1804. */
  1805. void trimBegin(size_t len) {
  1806. assert_leq(len, len_);
  1807. if(len == len_) {
  1808. len_ = 0; return;
  1809. }
  1810. for(size_t i = 0; i < len_-len; i++) {
  1811. cs_[i] = cs_[i+len];
  1812. }
  1813. len_ -= len;
  1814. }
  1815. /**
  1816. * Trim len characters from the end of the string.
  1817. */
  1818. void trimEnd(size_t len) {
  1819. if(len >= len_) len_ = 0;
  1820. else len_ -= len;
  1821. }
  1822. /**
  1823. * Copy 'sz' bytes from buffer 'b' into this string.
  1824. */
  1825. void append(const T* b, size_t sz) {
  1826. if(sz_ < len_ + sz) expandCopy((len_ + sz + S) * M);
  1827. memcpy(cs_ + len_, b, sz * sizeof(T));
  1828. len_ += sz;
  1829. }
  1830. /**
  1831. * Copy bytes from zero-terminated buffer 'b' into this string.
  1832. */
  1833. void append(const T* b) {
  1834. append(b, strlen(b));
  1835. }
  1836. /**
  1837. * Return the length of the string.
  1838. */
  1839. size_t length() const { return len_; }
  1840. /**
  1841. * Clear the buffer.
  1842. */
  1843. void clear() { len_ = 0; }
  1844. /**
  1845. * Return true iff the buffer is empty.
  1846. */
  1847. bool empty() const { return len_ == 0; }
  1848. /**
  1849. * Put a terminator in the 'len_'th element and then return a
  1850. * pointer to the buffer. Useful for printing.
  1851. */
  1852. const char* toZBufXForm(const char *xform) const {
  1853. ASSERT_ONLY(size_t xformElts = strlen(xform));
  1854. if(empty()) {
  1855. const_cast<char&>(zero_) = 0;
  1856. return &zero_;
  1857. }
  1858. char* printcs = const_cast<char*>(printcs_);
  1859. // Lazily allocate space for print buffer
  1860. for(size_t i = 0; i < len_; i++) {
  1861. assert_lt(cs_[i], (int)xformElts);
  1862. printcs[i] = xform[(int)cs_[i]];
  1863. }
  1864. printcs[len_] = 0;
  1865. return printcs_;
  1866. }
  1867. /**
  1868. * Put a terminator in the 'len_'th element and then return a
  1869. * pointer to the buffer. Useful for printing.
  1870. */
  1871. virtual const T* toZBuf() const {
  1872. if(empty()) {
  1873. const_cast<T&>(zeroT_) = 0;
  1874. return &zeroT_;
  1875. }
  1876. assert_leq(len_, sz_);
  1877. const_cast<T*>(cs_)[len_] = 0;
  1878. return cs_;
  1879. }
  1880. /**
  1881. * Return true iff this DNA string matches the given nucleotide
  1882. * character string.
  1883. */
  1884. bool eq(const char *str) const {
  1885. const char *self = toZBuf();
  1886. return strcmp(str, self) == 0;
  1887. }
  1888. /**
  1889. * Return a const version of the raw buffer.
  1890. */
  1891. const T* buf() const { return cs_; }
  1892. /**
  1893. * Return a writeable version of the raw buffer.
  1894. */
  1895. T* wbuf() { return cs_; }
  1896. protected:
  1897. /**
  1898. * Allocate new, bigger buffer and copy old contents into it. If
  1899. * requested size can be accommodated by current buffer, do nothing.
  1900. */
  1901. void expandCopy(size_t sz) {
  1902. if(sz_ >= sz) return; // done!
  1903. T *tmp = new T[sz + 1];
  1904. char *ptmp = new char[sz + 1];
  1905. if(cs_ != NULL) {
  1906. memcpy(tmp, cs_, sizeof(T)*len_);
  1907. delete[] cs_;
  1908. }
  1909. if(printcs_ != NULL) {
  1910. memcpy(ptmp, printcs_, sizeof(char)*len_);
  1911. delete[] printcs_;
  1912. }
  1913. cs_ = tmp;
  1914. printcs_ = ptmp;
  1915. sz_ = sz;
  1916. }
  1917. /**
  1918. * Allocate new, bigger buffer. If requested size can be
  1919. * accommodated by current buffer, do nothing.
  1920. */
  1921. void expandNoCopy(size_t sz) {
  1922. if(sz_ >= sz) return; // done!
  1923. if(cs_ != NULL) delete[] cs_;
  1924. if(printcs_ != NULL) delete[] printcs_;
  1925. cs_ = new T[sz + 1];
  1926. printcs_ = new char[sz + 1];
  1927. sz_ = sz;
  1928. }
  1929. T *cs_; // +1 so that we have the option of dropping in a terminating "\0"
  1930. char *printcs_; // +1 so that we have the option of dropping in a terminating "\0"
  1931. char zero_; // 0 terminator for empty string
  1932. T zeroT_; // 0 terminator for empty string
  1933. size_t len_; // # filled-in elements
  1934. size_t sz_; // size capacity of cs_
  1935. };
  1936. /**
  1937. * Simple string class with in-object storage.
  1938. *
  1939. * All copies induced by, e.g., operator=, the copy constructor,
  1940. * install() and append(), are shallow (using memcpy/sizeof). If deep
  1941. * copies are needed, use a different class.
  1942. *
  1943. * Reading from an uninitialized element results in an assert as long
  1944. * as NDEBUG is not defined. If NDEBUG is defined, the result is
  1945. * undefined.
  1946. */
  1947. template<typename T, int S>
  1948. class SStringFixed {
  1949. public:
  1950. explicit SStringFixed() : len_(0) { }
  1951. /**
  1952. * Create an SStringFixed from another SStringFixed.
  1953. */
  1954. SStringFixed(const SStringFixed<T, S>& o) {
  1955. *this = o;
  1956. }
  1957. /**
  1958. * Create an SStringFixed from a std::basic_string of the appropriate type.
  1959. */
  1960. explicit SStringFixed(const std::basic_string<T>& str) {
  1961. install(str.c_str(), str.length());
  1962. }
  1963. /**
  1964. * Create an SStringFixed from an array and size.
  1965. */
  1966. explicit SStringFixed(const T* b, size_t sz) {
  1967. install(b, sz);
  1968. }
  1969. /**
  1970. * Create an SStringFixed from a zero-terminated string.
  1971. */
  1972. explicit SStringFixed(const T* b) {
  1973. install(b, strlen(b));
  1974. }
  1975. virtual ~SStringFixed() { } // C++ needs this
  1976. /**
  1977. * Retrieve constant version of element i.
  1978. */
  1979. inline const T& operator[](size_t i) const {
  1980. return get(i);
  1981. }
  1982. /**
  1983. * Retrieve mutable version of element i.
  1984. */
  1985. inline T& operator[](size_t i) {
  1986. return get(i);
  1987. }
  1988. /**
  1989. * Retrieve constant version of element i.
  1990. */
  1991. inline const T& get(size_t i) const {
  1992. assert_lt(i, len_);
  1993. return cs_[i];
  1994. }
  1995. /**
  1996. * Retrieve mutable version of element i.
  1997. */
  1998. inline T& get(size_t i) {
  1999. assert_lt(i, len_);
  2000. return cs_[i];
  2001. }
  2002. /**
  2003. * Return ith character from the left of either the forward or the
  2004. * reverse-complement version of the read.
  2005. */
  2006. T windowGet(
  2007. size_t i,
  2008. bool fw,
  2009. size_t depth = 0,
  2010. size_t len = 0) const
  2011. {
  2012. if(len == 0) len = len_;
  2013. assert_lt(i, len);
  2014. assert_leq(len, len_ - depth);
  2015. return fw ? cs_[depth+i] : cs_[depth+len-i-1];
  2016. }
  2017. /**
  2018. * Append to 'ret' the characters of the window [depth, depth+len), in
  2019. * window order if fw is true and in reversed order otherwise.
  2020. */
  2021. void windowGet(
  2022. T& ret,
  2023. bool fw,
  2024. size_t depth = 0,
  2025. size_t len = 0) const
  2026. {
  2027. if(len == 0) len = len_;
  2028. assert_leq(len, len_ - depth);
  2029. for(size_t i = 0; i < len; i++) {
  2030. ret.append(fw ? cs_[depth+i] : cs_[depth+len-i-1]);
  2031. }
  2032. }
  2033. /**
  2034. * Assignment to other SStringFixed.
  2035. */
  2036. SStringFixed<T,S>& operator=(const SStringFixed<T,S>& o) {
  2037. install(o.cs_, o.len_);
  2038. return *this;
  2039. }
  2040. /**
  2041. * Assignment from a std::basic_string
  2042. */
  2043. SStringFixed<T,S>& operator=(const std::basic_string<T>& o) {
  2044. install(o);
  2045. return *this;
  2046. }
  2047. /**
  2048. * Insert char c before position 'idx'; slide subsequent chars down.
  2049. */
  2050. void insert(const T& c, size_t idx) {
  2051. assert_lt(len_, S);
  2052. assert_lt(idx, len_);
  2053. // Move everyone down by 1
  2054. for(int i = len_; i > idx; i--) {
  2055. cs_[i] = cs_[i-1];
  2056. }
  2057. cs_[idx] = c;
  2058. len_++;
  2059. }
  2060. /**
  2061. * Set character at index 'idx' to 'c'.
  2062. */
  2063. void set(int c, size_t idx) {
  2064. assert_lt(idx, len_);
  2065. cs_[idx] = c;
  2066. }
  2067. /**
  2068. * Append char c.
  2069. */
  2070. void append(const T& c) {
  2071. assert_lt(len_, S);
  2072. cs_[len_++] = c;
  2073. }
  2074. /**
  2075. * Delete char at position 'idx'; slide subsequent chars up.
  2076. */
  2077. void remove(size_t idx) {
  2078. assert_lt(idx, len_);
  2079. assert_gt(len_, 0);
  2080. for(size_t i = idx; i < len_-1; i++) {
  2081. cs_[i] = cs_[i+1];
  2082. }
  2083. len_--;
  2084. }
  2085. /**
  2086. * Copy 'sz' bytes from buffer 'b' into this string.
  2087. */
  2088. virtual void install(const T* b, size_t sz) {
  2089. assert_leq(sz, S);
  2090. memcpy(cs_, b, sz * sizeof(T));
  2091. len_ = sz;
  2092. }
  2093. /**
  2094. * Copy all bytes from zero-terminated buffer 'b' into this string.
  2095. */
  2096. void install(const T* b) { install(b, strlen(b)); }
  2097. /**
  2098. * Copy 'sz' bytes from buffer 'b' into this string, reversing them
  2099. * in the process.
  2100. */
  2101. void installReverse(const char* b, size_t sz) {
  2102. assert_leq(sz, S);
  2103. for(size_t i = 0; i < sz; i++) {
  2104. cs_[i] = b[sz-i-1];
  2105. }
  2106. len_ = sz;
  2107. }
  2108. /**
  2109. * Copy 'sz' bytes from buffer 'b' into this string, reversing them
  2110. * in the process.
  2111. */
  2112. void installReverse(const SStringFixed<T, S>& b) {
  2113. assert_leq(b.len_, S);
  2114. for(size_t i = 0; i < b.len_; i++) {
  2115. cs_[i] = b.cs_[b.len_ - i - 1];
  2116. }
  2117. len_ = b.len_;
  2118. }
  2119. /**
  2120. * Return true iff the two strings are equal.
  2121. */
  2122. bool operator==(const SStringFixed<T, S>& o) {
  2123. return sstr_eq(*this, o);
  2124. }
  2125. /**
  2126. * Return true iff the two strings are not equal.
  2127. */
  2128. bool operator!=(const SStringFixed<T, S>& o) {
  2129. return sstr_neq(*this, o);
  2130. }
  2131. /**
  2132. * Return true iff this string is less than given string.
  2133. */
  2134. bool operator<(const SStringFixed<T, S>& o) {
  2135. return sstr_lt(*this, o);
  2136. }
  2137. /**
  2138. * Return true iff this string is greater than given string.
  2139. */
  2140. bool operator>(const SStringFixed<T, S>& o) {
  2141. return sstr_gt(*this, o);
  2142. }
  2143. /**
  2144. * Return true iff this string is less than or equal to given string.
  2145. */
  2146. bool operator<=(const SStringFixed<T, S>& o) {
  2147. return sstr_leq(*this, o);
  2148. }
  2149. /**
  2150. * Return true iff this string is greater than or equal to given string.
  2151. */
  2152. bool operator>=(const SStringFixed<T, S>& o) {
  2153. return sstr_geq(*this, o);
  2154. }
  2155. /**
  2156. * Reverse the buffer in place.
  2157. */
  2158. void reverse() {
  2159. for(size_t i = 0; i < (len_ >> 1); i++) {
  2160. T tmp = get(i);
  2161. set(get(len_-i-1), i);
  2162. set(tmp, len_-i-1);
  2163. }
  2164. }
  2165. /**
  2166. * Reverse a substring of the buffer in place.
  2167. */
  2168. void reverseWindow(size_t off, size_t len) {
  2169. assert_leq(off, len_);
  2170. assert_leq(off + len, len_);
  2171. size_t mid = len >> 1;
  2172. for(size_t i = 0; i < mid; i++) {
  2173. T tmp = get(off+i);
  2174. set(get(off+len-i-1), off+i);
  2175. set(tmp, off+len-i-1);
  2176. }
  2177. }
  2178. /**
  2179. * Simply resize the buffer. If the buffer is resized to be
  2180. * longer, the newly-added elements will contain garbage and should
  2181. * be initialized immediately.
  2182. */
  2183. void resize(size_t len) {
  2184. assert_lt(len, S);
  2185. len_ = len;
  2186. }
  2187. /**
  2188. * Simply resize the buffer. If the buffer is resized to be
  2189. * longer, new elements will be initialized with 'el'.
  2190. */
  2191. void resize(size_t len, const T& el) {
  2192. assert_lt(len, S);
  2193. if(len > len_) {
  2194. for(size_t i = len_; i < len; i++) {
  2195. cs_[i] = el;
  2196. }
  2197. }
  2198. len_ = len;
  2199. }
  2200. /**
  2201. * Set the first len elements of the buffer to el.
  2202. */
  2203. void fill(size_t len, const T& el) {
  2204. assert_leq(len, len_);
  2205. for(size_t i = 0; i < len; i++) {
  2206. cs_[i] = el;
  2207. }
  2208. }
  2209. /**
  2210. * Set all elements of the buffer to el.
  2211. */
  2212. void fill(const T& el) {
  2213. fill(len_, el);
  2214. }
  2215. /**
  2216. * Trim len characters from the beginning of the string.
  2217. */
  2218. void trimBegin(size_t len) {
  2219. assert_leq(len, len_);
  2220. if(len == len_) {
  2221. len_ = 0; return;
  2222. }
  2223. for(size_t i = 0; i < len_-len; i++) {
  2224. cs_[i] = cs_[i+len];
  2225. }
  2226. len_ -= len;
  2227. }
  2228. /**
  2229. * Trim len characters from the end of the string.
  2230. */
  2231. void trimEnd(size_t len) {
  2232. if(len >= len_) len_ = 0;
  2233. else len_ -= len;
  2234. }
  2235. /**
  2236. * Copy 'sz' bytes from buffer 'b' into this string.
  2237. */
  2238. void append(const T* b, size_t sz) {
  2239. assert_leq(sz + len_, S);
  2240. memcpy(cs_ + len_, b, sz * sizeof(T));
  2241. len_ += sz;
  2242. }
  2243. /**
  2244. * Copy bytes from zero-terminated buffer 'b' into this string.
  2245. */
  2246. void append(const T* b) {
  2247. append(b, strlen(b));
  2248. }
  2249. /**
  2250. * Return the length of the string.
  2251. */
  2252. size_t length() const { return len_; }
  2253. /**
  2254. * Clear the buffer.
  2255. */
  2256. void clear() { len_ = 0; }
  2257. /**
  2258. * Return true iff the buffer is empty.
  2259. */
  2260. bool empty() const { return len_ == 0; }
  2261. /**
  2262. * Put a terminator in the 'len_'th element and then return a
  2263. * pointer to the buffer. Useful for printing.
  2264. */
  2265. virtual const T* toZBuf() const {
  2266. const_cast<T*>(cs_)[len_] = 0;
  2267. return cs_;
  2268. }
  2269. /**
  2270. * Return true iff this DNA string matches the given nucleotide
  2271. * character string.
  2272. */
  2273. bool eq(const char *str) const {
  2274. const char *self = toZBuf();
  2275. return strcmp(str, self) == 0;
  2276. }
  2277. /**
  2278. * Put a terminator in the 'len_'th element and then return a
  2279. * pointer to the buffer. Useful for printing.
  2280. */
  2281. const char* toZBufXForm(const char *xform) const {
  2282. ASSERT_ONLY(size_t xformElts = strlen(xform));
  2283. char* printcs = const_cast<char*>(printcs_);
  2284. for(size_t i = 0; i < len_; i++) {
  2285. assert_lt(cs_[i], (int)xformElts);
  2286. printcs[i] = xform[cs_[i]];
  2287. }
  2288. printcs[len_] = 0;
  2289. return printcs_;
  2290. }
  2291. /**
  2292. * Return a const version of the raw buffer.
  2293. */
  2294. const T* buf() const { return cs_; }
  2295. /**
  2296. * Return a writeable version of the raw buffer.
  2297. */
  2298. T* wbuf() { return cs_; }
  protected:
      // Element storage.  Indices [0, len_) hold the string; toZBuf()
      // writes a terminator at index len_.
      T cs_[S+1]; // +1 so that we have the option of dropping in a terminating "\0"
      // Scratch buffer that toZBufXForm() fills with the translated,
      // zero-terminated rendering of the string.
      char printcs_[S+1]; // +1 so that we have the option of dropping in a terminating "\0"
      // Current number of elements in the string.
      size_t len_;
  2303. };
  2304. //
  2305. // Stream put operators
  2306. //
  2307. template <typename T, int S, int M>
  2308. std::ostream& operator<< (std::ostream& os, const SStringExpandable<T, S, M>& str) {
  2309. os << str.toZBuf();
  2310. return os;
  2311. }
  2312. template <typename T, int S>
  2313. std::ostream& operator<< (std::ostream& os, const SStringFixed<T, S>& str) {
  2314. os << str.toZBuf();
  2315. return os;
  2316. }
  // Lookup tables defined elsewhere.  The DNA string classes below index
  // them with a character code: installChars()/setChar()/appendChar() go
  // through asc2dna, and installColors() goes through asc2col.  Results
  // are asserted to lie in the range 0..4.
  extern uint8_t asc2dna[];
  extern uint8_t asc2col[];
  2319. /**
  2320. * Encapsulates a fixed-length DNA string with characters encoded as
  2321. * chars. Only capable of encoding A, C, G, T and N. The length is
  2322. * specified via the template parameter S.
  2323. */
  2324. template<int S>
  2325. class SDnaStringFixed : public SStringFixed<char, S> {
  2326. public:
  2327. explicit SDnaStringFixed() : SStringFixed<char, S>() { }
  2328. /**
  2329. * Create an SStringFixed from another SStringFixed.
  2330. */
  2331. SDnaStringFixed(const SDnaStringFixed<S>& o) :
  2332. SStringFixed<char, S>(o) { }
  2333. /**
  2334. * Create an SStringFixed from a C++ basic_string.
  2335. */
  2336. explicit SDnaStringFixed(const std::basic_string<char>& str) :
  2337. SStringFixed<char, S>(str) { }
  2338. /**
  2339. * Create an SStringFixed from an array and size.
  2340. */
  2341. explicit SDnaStringFixed(const char* b, size_t sz) :
  2342. SStringFixed<char, S>(b, sz) { }
  2343. /**
  2344. * Create an SStringFixed from a zero-terminated string.
  2345. */
  2346. explicit SDnaStringFixed(
  2347. const char* b,
  2348. bool chars = false,
  2349. bool colors = false) :
  2350. SStringFixed<char, S>()
  2351. {
  2352. if(chars) {
  2353. if(colors) {
  2354. installColors(b, strlen(b));
  2355. } else {
  2356. installChars(b, strlen(b));
  2357. }
  2358. } else {
  2359. install(b, strlen(b));
  2360. }
  2361. }
  2362. virtual ~SDnaStringFixed() { } // C++ needs this
  2363. /**
  2364. * Copy 'sz' bytes from buffer 'b' into this string, reverse-
  2365. * complementing them in the process, assuming an encoding where
  2366. * 0=A, 1=C, 2=G, 3=T, 4=N.
  2367. */
  2368. void installReverseComp(const char* b, size_t sz) {
  2369. assert_leq(sz, S);
  2370. for(size_t i = 0; i < sz; i++) {
  2371. this->cs_[i] = (b[sz-i-1] == 4 ? 4 : b[sz-i-1] ^ 3);
  2372. }
  2373. this->len_ = sz;
  2374. }
  2375. /**
  2376. * Copy 'sz' bytes from buffer 'b' into this string, reverse-
  2377. * complementing them in the process, assuming an encoding where
  2378. * 0=A, 1=C, 2=G, 3=T, 4=N.
  2379. */
  2380. void installReverseComp(const SDnaStringFixed<S>& b) {
  2381. assert_leq(b.len_, S);
  2382. for(size_t i = 0; i < b.len_; i++) {
  2383. this->cs_[i] = (b.cs_[b.len_-i-1] == 4 ? 4 : b.cs_[b.len_-i-1] ^ 3);
  2384. }
  2385. this->len_ = b.len_;
  2386. }
  2387. /**
  2388. * Either reverse or reverse-complement (depending on "color") this
  2389. * DNA buffer in-place.
  2390. */
  2391. void reverseComp(bool color = false) {
  2392. if(color) {
  2393. this->reverse();
  2394. } else {
  2395. for(size_t i = 0; i < (this->len_ >> 1); i++) {
  2396. char tmp1 = (this->cs_[i] == 4 ? 4 : this->cs_[i] ^ 3);
  2397. char tmp2 = (this->cs_[this->len_-i-1] == 4 ? 4 : this->cs_[this->len_-i-1] ^ 3);
  2398. this->cs_[i] = tmp2;
  2399. this->cs_[this->len_-i-1] = tmp1;
  2400. }
  2401. // Do middle element iff there are an odd number
  2402. if((this->len_ & 1) != 0) {
  2403. char tmp = this->cs_[this->len_ >> 1];
  2404. tmp = (tmp == 4 ? 4 : tmp ^ 3);
  2405. this->cs_[this->len_ >> 1] = tmp;
  2406. }
  2407. }
  2408. }
  2409. /**
  2410. * Copy 'sz' bytes from buffer 'b' into this string.
  2411. */
  2412. virtual void install(const char* b, size_t sz) {
  2413. assert_leq(sz, S);
  2414. memcpy(this->cs_, b, sz);
  2415. #ifndef NDEBUG
  2416. for(size_t i = 0; i < sz; i++) {
  2417. assert_leq(this->cs_[i], 4);
  2418. assert_geq(this->cs_[i], 0);
  2419. }
  2420. #endif
  2421. this->len_ = sz;
  2422. }
  2423. /**
  2424. * Copy buffer 'b' of ASCII DNA characters into normal DNA
  2425. * characters.
  2426. */
  2427. virtual void installChars(const char* b, size_t sz) {
  2428. assert_leq(sz, S);
  2429. for(size_t i = 0; i < sz; i++) {
  2430. assert_in(toupper(b[i]), "ACGTN-");
  2431. this->cs_[i] = asc2dna[(int)b[i]];
  2432. assert_geq(this->cs_[i], 0);
  2433. assert_leq(this->cs_[i], 4);
  2434. }
  2435. this->len_ = sz;
  2436. }
  2437. /**
  2438. * Copy buffer 'b' of ASCII color characters into normal DNA
  2439. * characters.
  2440. */
  2441. virtual void installColors(const char* b, size_t sz) {
  2442. assert_leq(sz, S);
  2443. for(size_t i = 0; i < sz; i++) {
  2444. assert_in(b[i], "0123.");
  2445. this->cs_[i] = asc2col[(int)b[i]];
  2446. assert_geq(this->cs_[i], 0);
  2447. assert_leq(this->cs_[i], 4);
  2448. }
  2449. this->len_ = sz;
  2450. }
  2451. /**
  2452. * Copy C++ string of ASCII DNA characters into normal DNA
  2453. * characters.
  2454. */
  2455. virtual void installChars(const std::basic_string<char>& str) {
  2456. installChars(str.c_str(), str.length());
  2457. }
  2458. /**
  2459. * Copy C++ string of ASCII color characters into normal DNA
  2460. * characters.
  2461. */
  2462. virtual void installColors(const std::basic_string<char>& str) {
  2463. installColors(str.c_str(), str.length());
  2464. }
  2465. /**
  2466. * Set DNA character at index 'idx' to 'c'.
  2467. */
  2468. void set(int c, size_t idx) {
  2469. assert_lt(idx, this->len_);
  2470. assert_leq(c, 4);
  2471. assert_geq(c, 0);
  2472. this->cs_[idx] = c;
  2473. }
  2474. /**
  2475. * Append DNA char c.
  2476. */
  2477. void append(const char& c) {
  2478. assert_lt(this->len_, S);
  2479. assert_leq(c, 4);
  2480. assert_geq(c, 0);
  2481. this->cs_[this->len_++] = c;
  2482. }
  2483. /**
  2484. * Set DNA character at index 'idx' to 'c'.
  2485. */
  2486. void setChar(char c, size_t idx) {
  2487. assert_lt(idx, this->len_);
  2488. assert_in(toupper(c), "ACGTN");
  2489. this->cs_[idx] = asc2dna[(int)c];
  2490. }
  2491. /**
  2492. * Append DNA character.
  2493. */
  2494. void appendChar(char c) {
  2495. assert_lt(this->len_, S);
  2496. assert_in(toupper(c), "ACGTN");
  2497. this->cs_[this->len_++] = asc2dna[(int)c];
  2498. }
  2499. /**
  2500. * Return DNA character corresponding to element 'idx'.
  2501. */
  2502. char toChar(size_t idx) const {
  2503. assert_geq((int)this->cs_[idx], 0);
  2504. assert_leq((int)this->cs_[idx], 4);
  2505. return "ACGTN"[(int)this->cs_[idx]];
  2506. }
  2507. /**
  2508. * Retrieve constant version of element i.
  2509. */
  2510. const char& operator[](size_t i) const {
  2511. return this->get(i);
  2512. }
  2513. /**
  2514. * Retrieve constant version of element i.
  2515. */
  2516. const char& get(size_t i) const {
  2517. assert_lt(i, this->len_);
  2518. assert_leq(this->cs_[i], 4);
  2519. assert_geq(this->cs_[i], 0);
  2520. return this->cs_[i];
  2521. }
  2522. /**
  2523. * Return the ith character in the window defined by fw, color,
  2524. * depth and len.
  2525. */
  2526. char windowGetDna(
  2527. size_t i,
  2528. bool fw,
  2529. bool color,
  2530. size_t depth = 0,
  2531. size_t len = 0) const
  2532. {
  2533. if(len == 0) len = this->len_;
  2534. assert_lt(i, len);
  2535. assert_leq(len, this->len_ - depth);
  2536. if(fw) return this->cs_[depth+i];
  2537. else return color ? this->cs_[depth+len-i-1] :
  2538. compDna(this->cs_[depth+len-i-1]);
  2539. }
  2540. /**
  2541. * Fill the given DNA buffer with the substring specified by fw,
  2542. * color, depth and len.
  2543. */
  2544. void windowGetDna(
  2545. SDnaStringFixed<S>& buf,
  2546. bool fw,
  2547. bool color,
  2548. size_t depth = 0,
  2549. size_t len = 0) const
  2550. {
  2551. if(len == 0) len = this->len_;
  2552. assert_leq(len, this->len_ - depth);
  2553. for(size_t i = 0; i < len; i++) {
  2554. buf.append(fw ? this->cs_[depth+i] :
  2555. (color ? this->cs_[depth+len-i-1] :
  2556. compDna(this->cs_[depth+len-i-1])));
  2557. }
  2558. }
  2559. /**
  2560. * Put a terminator in the 'len_'th element and then return a
  2561. * pointer to the buffer. Useful for printing.
  2562. */
  2563. virtual const char* toZBuf() const { return this->toZBufXForm("ACGTN"); }
  2564. };
  2565. /**
  2566. * Encapsulates a fixed-length DNA string with characters encoded as
  2567. * chars. Only capable of encoding A, C, G, T and N. The length is
  2568. * specified via the template parameter S.
  2569. */
  2570. template<int S = 1024, int M = 2>
  2571. class SDnaStringExpandable : public SStringExpandable<char, S, M> {
  2572. public:
  2573. explicit SDnaStringExpandable() : SStringExpandable<char, S, M>() { }
  2574. /**
  2575. * Create an SStringFixed from another SStringFixed.
  2576. */
  2577. SDnaStringExpandable(const SDnaStringExpandable<S, M>& o) :
  2578. SStringExpandable<char, S, M>(o) { }
  2579. /**
  2580. * Create an SStringFixed from a C++ basic_string.
  2581. */
  2582. explicit SDnaStringExpandable(
  2583. const std::basic_string<char>& str,
  2584. bool chars = false,
  2585. bool colors = false) :
  2586. SStringExpandable<char, S, M>()
  2587. {
  2588. if(chars) {
  2589. if(colors) {
  2590. installColors(str);
  2591. } else {
  2592. installChars(str);
  2593. }
  2594. } else {
  2595. install(str);
  2596. }
  2597. }
  2598. /**
  2599. * Create an SStringFixed from an array and size.
  2600. */
  2601. explicit SDnaStringExpandable(
  2602. const char* b,
  2603. size_t sz,
  2604. bool chars = false,
  2605. bool colors = false) :
  2606. SStringExpandable<char, S, M>()
  2607. {
  2608. if(chars) {
  2609. if(colors) {
  2610. installColors(b, sz);
  2611. } else {
  2612. installChars(b, sz);
  2613. }
  2614. } else {
  2615. install(b, sz);
  2616. }
  2617. }
  2618. /**
  2619. * Create an SStringFixed from a zero-terminated string.
  2620. */
  2621. explicit SDnaStringExpandable(
  2622. const char* b,
  2623. bool chars = false,
  2624. bool colors = false) :
  2625. SStringExpandable<char, S, M>()
  2626. {
  2627. install(b, chars, colors);
  2628. }
  2629. virtual ~SDnaStringExpandable() { } // C++ needs this
  2630. /**
  2631. * Copy 'sz' bytes from buffer 'b' into this string, reverse-
  2632. * complementing them in the process, assuming an encoding where
  2633. * 0=A, 1=C, 2=G, 3=T, 4=N.
  2634. */
  2635. void installReverseComp(const char* b, size_t sz) {
  2636. if(this->sz_ < sz) this->expandCopy((sz + S) * M);
  2637. for(size_t i = 0; i < sz; i++) {
  2638. this->cs_[i] = (b[sz-i-1] == 4 ? 4 : b[sz-i-1] ^ 3);
  2639. }
  2640. this->len_ = sz;
  2641. }
  2642. /**
  2643. * Copy 'sz' bytes from buffer 'b' into this string, reverse-
  2644. * complementing them in the process, assuming an encoding where
  2645. * 0=A, 1=C, 2=G, 3=T, 4=N.
  2646. */
  2647. void installReverseComp(const SDnaStringExpandable<S, M>& b) {
  2648. if(this->sz_ < b.len_) this->expandCopy((b.len_ + S) * M);
  2649. for(size_t i = 0; i < b.len_; i++) {
  2650. this->cs_[i] = (b.cs_[b.len_-i-1] == 4 ? 4 : b.cs_[b.len_-i-1] ^ 3);
  2651. }
  2652. this->len_ = b.len_;
  2653. }
  2654. /**
  2655. * Either reverse or reverse-complement (depending on "color") this
  2656. * DNA buffer in-place.
  2657. */
  2658. void reverseComp(bool color = false) {
  2659. if(color) {
  2660. this->reverse();
  2661. } else {
  2662. for(size_t i = 0; i < (this->len_ >> 1); i++) {
  2663. char tmp1 = (this->cs_[i] == 4 ? 4 : this->cs_[i] ^ 3);
  2664. char tmp2 = (this->cs_[this->len_-i-1] == 4 ? 4 : this->cs_[this->len_-i-1] ^ 3);
  2665. this->cs_[i] = tmp2;
  2666. this->cs_[this->len_-i-1] = tmp1;
  2667. }
  2668. // Do middle element iff there are an odd number
  2669. if((this->len_ & 1) != 0) {
  2670. char tmp = this->cs_[this->len_ >> 1];
  2671. tmp = (tmp == 4 ? 4 : tmp ^ 3);
  2672. this->cs_[this->len_ >> 1] = tmp;
  2673. }
  2674. }
  2675. }
  2676. /**
  2677. * Copy 'sz' bytes from buffer 'b' into this string.
  2678. */
  2679. virtual void install(
  2680. const char* b,
  2681. bool chars = false,
  2682. bool colors = false)
  2683. {
  2684. if(chars) {
  2685. if(colors) {
  2686. installColors(b, strlen(b));
  2687. } else {
  2688. installChars(b, strlen(b));
  2689. }
  2690. } else {
  2691. install(b, strlen(b));
  2692. }
  2693. }
  2694. /**
  2695. * Copy 'sz' bytes from buffer 'b' into this string.
  2696. */
  2697. virtual void install(const char* b, size_t sz) {
  2698. if(this->sz_ < sz) this->expandCopy((sz + S) * M);
  2699. memcpy(this->cs_, b, sz);
  2700. #ifndef NDEBUG
  2701. for(size_t i = 0; i < sz; i++) {
  2702. assert_range(0, 4, (int)this->cs_[i]);
  2703. }
  2704. #endif
  2705. this->len_ = sz;
  2706. }
  2707. /**
  2708. * Copy buffer 'b' of ASCII DNA characters into normal DNA
  2709. * characters.
  2710. */
  2711. virtual void installChars(const char* b, size_t sz) {
  2712. if(this->sz_ < sz) this->expandCopy((sz + S) * M);
  2713. for(size_t i = 0; i < sz; i++) {
  2714. assert_in(toupper(b[i]), "ACGTN-");
  2715. this->cs_[i] = asc2dna[(int)b[i]];
  2716. assert_range(0, 4, (int)this->cs_[i]);
  2717. }
  2718. this->len_ = sz;
  2719. }
  2720. /**
  2721. * Copy buffer 'b' of ASCII color characters into normal DNA
  2722. * characters.
  2723. */
  2724. virtual void installColors(const char* b, size_t sz) {
  2725. if(this->sz_ < sz) this->expandCopy((sz + S) * M);
  2726. for(size_t i = 0; i < sz; i++) {
  2727. assert_in(b[i], "0123.");
  2728. this->cs_[i] = asc2col[(int)b[i]];
  2729. assert_range(0, 4, (int)this->cs_[i]);
  2730. }
  2731. this->len_ = sz;
  2732. }
  2733. /**
  2734. * Copy C++ string of ASCII DNA characters into normal DNA
  2735. * characters.
  2736. */
  2737. virtual void installChars(const std::basic_string<char>& str) {
  2738. installChars(str.c_str(), str.length());
  2739. }
  2740. /**
  2741. * Copy C++ string of ASCII color characters into normal DNA
  2742. * characters.
  2743. */
  2744. virtual void installColors(const std::basic_string<char>& str) {
  2745. installColors(str.c_str(), str.length());
  2746. }
  2747. /**
  2748. * Set DNA character at index 'idx' to 'c'.
  2749. */
  2750. void set(int c, size_t idx) {
  2751. assert_lt(idx, this->len_);
  2752. assert_range(0, 4, c);
  2753. this->cs_[idx] = c;
  2754. }
  2755. /**
  2756. * Append DNA char c.
  2757. */
  2758. void append(const char& c) {
  2759. if(this->sz_ < this->len_ + 1) {
  2760. this->expandCopy((this->len_ + 1 + S) * M);
  2761. }
  2762. assert_range(0, 4, (int)c);
  2763. this->cs_[this->len_++] = c;
  2764. }
  2765. /**
  2766. * Set DNA character at index 'idx' to 'c'.
  2767. */
  2768. void setChar(char c, size_t idx) {
  2769. assert_lt(idx, this->len_);
  2770. assert_in(toupper(c), "ACGTN");
  2771. this->cs_[idx] = asc2dna[(int)c];
  2772. }
  2773. /**
  2774. * Append DNA character.
  2775. */
  2776. void appendChar(char c) {
  2777. if(this->sz_ < this->len_ + 1) {
  2778. this->expandCopy((this->len_ + 1 + S) * M);
  2779. }
  2780. assert_in(toupper(c), "ACGTN");
  2781. this->cs_[this->len_++] = asc2dna[(int)c];
  2782. }
  2783. /**
  2784. * Return DNA character corresponding to element 'idx'.
  2785. */
  2786. char toChar(size_t idx) const {
  2787. assert_range(0, 4, (int)this->cs_[idx]);
  2788. return "ACGTN"[(int)this->cs_[idx]];
  2789. }
  2790. /**
  2791. * Retrieve constant version of element i.
  2792. */
  2793. inline const char& operator[](size_t i) const {
  2794. return this->get(i);
  2795. }
  2796. /**
  2797. * Retrieve constant version of element i.
  2798. */
  2799. inline const char& get(size_t i) const {
  2800. assert_lt(i, this->len_);
  2801. assert_range(0, 4, (int)this->cs_[i]);
  2802. return this->cs_[i];
  2803. }
  2804. /**
  2805. * Return the ith character in the window defined by fw, color,
  2806. * depth and len.
  2807. */
  2808. char windowGetDna(
  2809. size_t i,
  2810. bool fw,
  2811. bool color,
  2812. size_t depth = 0,
  2813. size_t len = 0) const
  2814. {
  2815. if(len == 0) len = this->len_;
  2816. assert_lt(i, len);
  2817. assert_leq(len, this->len_ - depth);
  2818. if(fw) return this->cs_[depth+i];
  2819. else return color ? this->cs_[depth+len-i-1] :
  2820. compDna(this->cs_[depth+len-i-1]);
  2821. }
  2822. /**
  2823. * Fill the given DNA buffer with the substring specified by fw,
  2824. * color, depth and len.
  2825. */
  2826. void windowGetDna(
  2827. SDnaStringExpandable<S, M>& buf,
  2828. bool fw,
  2829. bool color,
  2830. size_t depth = 0,
  2831. size_t len = 0) const
  2832. {
  2833. if(len == 0) len = this->len_;
  2834. assert_leq(len, this->len_ - depth);
  2835. for(size_t i = 0; i < len; i++) {
  2836. buf.append(fw ? this->cs_[depth+i] :
  2837. (color ? this->cs_[depth+len-i-1] :
  2838. compDna(this->cs_[depth+len-i-1])));
  2839. }
  2840. }
  2841. /**
  2842. * Put a terminator in the 'len_'th element and then return a
  2843. * pointer to the buffer. Useful for printing.
  2844. */
  2845. virtual const char* toZBuf() const { return this->toZBufXForm("ACGTN"); }
  2846. };
  2847. /**
  2848. * Encapsulates an expandable DNA string with characters encoded as
  2849. * char-sized masks. Encodes A, C, G, T, and all IUPAC, as well as the
  2850. * empty mask indicating "matches nothing."
  2851. */
  2852. template<int S = 16, int M = 2>
  2853. class SDnaMaskString : public SStringExpandable<char, S, M> {
  2854. public:
  2855. explicit SDnaMaskString() : SStringExpandable<char, S, M>() { }
  2856. /**
  2857. * Create an SStringFixed from another SStringFixed.
  2858. */
  2859. SDnaMaskString(const SDnaMaskString<S, M>& o) :
  2860. SStringExpandable<char, S, M>(o) { }
  2861. /**
  2862. * Create an SStringFixed from a C++ basic_string.
  2863. */
  2864. explicit SDnaMaskString(const std::basic_string<char>& str) :
  2865. SStringExpandable<char, S, M>(str) { }
  2866. /**
  2867. * Create an SStringFixed from an array and size.
  2868. */
  2869. explicit SDnaMaskString(const char* b, size_t sz) :
  2870. SStringExpandable<char, S, M>(b, sz) { }
  2871. /**
  2872. * Create an SStringFixed from a zero-terminated string.
  2873. */
  2874. explicit SDnaMaskString(const char* b, bool chars = false) :
  2875. SStringExpandable<char, S, M>()
  2876. {
  2877. if(chars) {
  2878. installChars(b, strlen(b));
  2879. } else {
  2880. install(b, strlen(b));
  2881. }
  2882. }
  2883. virtual ~SDnaMaskString() { } // C++ needs this
  2884. /**
  2885. * Copy 'sz' bytes from buffer 'b' into this string, reverse-
  2886. * complementing them in the process, assuming an encoding where
  2887. * 0=A, 1=C, 2=G, 3=T, 4=N.
  2888. */
  2889. void installReverseComp(const char* b, size_t sz) {
  2890. while(this->sz_ < sz) {
  2891. this->expandNoCopy((sz + S) * M);
  2892. }
  2893. for(size_t i = 0; i < sz; i++) {
  2894. this->cs_[i] = maskcomp[(int)b[sz-i-1]];
  2895. }
  2896. this->len_ = sz;
  2897. }
  2898. /**
  2899. * Copy 'sz' bytes from buffer 'b' into this string, reverse-
  2900. * complementing them in the process, assuming an encoding where
  2901. * 0=A, 1=C, 2=G, 3=T, 4=N.
  2902. */
  2903. void installReverseComp(const SDnaMaskString<S, M>& b) {
  2904. while(this->sz_ < b.len_) {
  2905. this->expandNoCopy((b.len_ + S) * M);
  2906. }
  2907. for(size_t i = 0; i < b.len_; i++) {
  2908. this->cs_[i] = maskcomp[(int)b.cs_[b.len_-i-1]];
  2909. }
  2910. this->len_ = b.len_;
  2911. }
  2912. /**
  2913. * Either reverse or reverse-complement (depending on "color") this
  2914. * DNA buffer in-place.
  2915. */
  2916. void reverseComp(bool color = false) {
  2917. if(color) {
  2918. this->reverse();
  2919. } else {
  2920. for(size_t i = 0; i < (this->len_ >> 1); i++) {
  2921. char tmp1 = maskcomp[(int)this->cs_[i]];
  2922. char tmp2 = maskcomp[(int)this->cs_[this->len_-i-1]];
  2923. this->cs_[i] = tmp2;
  2924. this->cs_[this->len_-i-1] = tmp1;
  2925. }
  2926. // Do middle element iff there are an odd number
  2927. if((this->len_ & 1) != 0) {
  2928. char tmp = this->cs_[this->len_ >> 1];
  2929. tmp = maskcomp[(int)tmp];
  2930. this->cs_[this->len_ >> 1] = tmp;
  2931. }
  2932. }
  2933. }
  2934. /**
  2935. * Copy 'sz' bytes from buffer 'b' into this string.
  2936. */
  2937. virtual void install(const char* b, size_t sz) {
  2938. while(this->sz_ < sz) {
  2939. this->expandNoCopy((sz + S) * M);
  2940. }
  2941. memcpy(this->cs_, b, sz);
  2942. #ifndef NDEBUG
  2943. for(size_t i = 0; i < sz; i++) {
  2944. assert_range((int)this->cs_[i], 0, 15);
  2945. }
  2946. #endif
  2947. this->len_ = sz;
  2948. }
  2949. /**
  2950. * Copy buffer 'b' of ASCII DNA characters into DNA masks.
  2951. */
  2952. virtual void installChars(const char* b, size_t sz) {
  2953. while(this->sz_ < sz) {
  2954. this->expandNoCopy((sz + S) * M);
  2955. }
  2956. for(size_t i = 0; i < sz; i++) {
  2957. assert_in(b[i], iupacs);
  2958. this->cs_[i] = asc2dnamask[(int)b[i]];
  2959. assert_range((int)this->cs_[i], 0, 15);
  2960. }
  2961. this->len_ = sz;
  2962. }
  2963. /**
  2964. * Copy C++ string of ASCII DNA characters into normal DNA
  2965. * characters.
  2966. */
  2967. virtual void installChars(const std::basic_string<char>& str) {
  2968. installChars(str.c_str(), str.length());
  2969. }
  2970. /**
  2971. * Set DNA character at index 'idx' to 'c'.
  2972. */
  2973. void set(int c, size_t idx) {
  2974. assert_lt(idx, this->len_);
  2975. assert_range(c, 0, 15);
  2976. this->cs_[idx] = c;
  2977. }
  2978. /**
  2979. * Append DNA char c.
  2980. */
  2981. void append(const char& c) {
  2982. while(this->sz_ < this->len_+1) {
  2983. this->expandNoCopy((this->len_ + 1 + S) * M);
  2984. }
  2985. assert_range((int)c, 0, 15);
  2986. this->cs_[this->len_++] = c;
  2987. }
  2988. /**
  2989. * Set DNA character at index 'idx' to 'c'.
  2990. */
  2991. void setChar(char c, size_t idx) {
  2992. assert_lt(idx, this->len_);
  2993. assert_in(toupper(c), iupacs);
  2994. this->cs_[idx] = asc2dnamask[(int)c];
  2995. }
  2996. /**
  2997. * Append DNA character.
  2998. */
  2999. void appendChar(char c) {
  3000. while(this->sz_ < this->len_+1) {
  3001. expandNoCopy((this->len_ + 1 + S) * M);
  3002. }
  3003. assert_in(toupper(c), iupacs);
  3004. this->cs_[this->len_++] = asc2dnamask[(int)c];
  3005. }
  3006. /**
  3007. * Return DNA character corresponding to element 'idx'.
  3008. */
  3009. char toChar(size_t idx) const {
  3010. assert_range((int)this->cs_[idx], 0, 15);
  3011. return mask2iupac[(int)this->cs_[idx]];
  3012. }
  3013. /**
  3014. * Retrieve constant version of element i.
  3015. */
  3016. const char& operator[](size_t i) const {
  3017. return this->get(i);
  3018. }
  3019. /**
  3020. * Retrieve mutable version of element i.
  3021. */
  3022. char& operator[](size_t i) {
  3023. return this->get(i);
  3024. }
  3025. /**
  3026. * Retrieve constant version of element i.
  3027. */
  3028. const char& get(size_t i) const {
  3029. assert_lt(i, this->len_);
  3030. assert_range((int)this->cs_[i], 0, 15);
  3031. return this->cs_[i];
  3032. }
  3033. /**
  3034. * Retrieve mutable version of element i.
  3035. */
  3036. char& get(size_t i) {
  3037. assert_lt(i, this->len_);
  3038. assert_range((int)this->cs_[i], 0, 15);
  3039. return this->cs_[i];
  3040. }
  3041. /**
  3042. * Return the ith character in the window defined by fw, color,
  3043. * depth and len.
  3044. */
  3045. char windowGetDna(
  3046. size_t i,
  3047. bool fw,
  3048. bool color,
  3049. size_t depth = 0,
  3050. size_t len = 0) const
  3051. {
  3052. if(len == 0) len = this->len_;
  3053. assert_lt(i, len);
  3054. assert_leq(len, this->len_ - depth);
  3055. if(fw) return this->cs_[depth+i];
  3056. else return color ? this->cs_[depth+len-i-1] :
  3057. maskcomp[this->cs_[depth+len-i-1]];
  3058. }
  3059. /**
  3060. * Fill the given DNA buffer with the substring specified by fw,
  3061. * color, depth and len.
  3062. */
  3063. void windowGetDna(
  3064. SDnaStringFixed<S>& buf,
  3065. bool fw,
  3066. bool color,
  3067. size_t depth = 0,
  3068. size_t len = 0) const
  3069. {
  3070. if(len == 0) len = this->len_;
  3071. assert_leq(len, this->len_ - depth);
  3072. for(size_t i = 0; i < len; i++) {
  3073. buf.append(fw ? this->cs_[depth+i] :
  3074. (color ? this->cs_[depth+len-i-1] :
  3075. maskcomp[this->cs_[depth+len-i-1]]));
  3076. }
  3077. }
  3078. /**
  3079. * Sample a random substring of the given length from this DNA
  3080. * string and install the result in 'dst'.
  3081. */
  3082. template<typename T>
  3083. void randSubstr(
  3084. RandomSource& rnd, // pseudo-random generator
  3085. T& dst, // put sampled substring here
  3086. size_t len, // length of substring to extract
  3087. bool watson = true, // true -> possibly extract from Watson strand
  3088. bool crick = true) // true -> possibly extract from Crick strand
  3089. {
  3090. assert(watson || crick);
  3091. assert_geq(this->len_, len);
  3092. size_t poss = this->len_ - len + 1;
  3093. assert_gt(poss, 0);
  3094. uint32_t rndoff = (uint32_t)(rnd.nextU32() % poss);
  3095. bool fw;
  3096. if (watson && !crick) fw = true;
  3097. else if(!watson && crick) fw = false;
  3098. else {
  3099. fw = rnd.nextBool();
  3100. }
  3101. if(fw) {
  3102. // Install Watson substring
  3103. for(size_t i = 0; i < len; i++) {
  3104. dst[i] = this->cs_[i + rndoff];
  3105. }
  3106. } else {
  3107. // Install Crick substring
  3108. for(size_t i = 0; i < len; i++) {
  3109. dst[i] = maskcomp[(int)this->cs_[i + rndoff + (len - i - 1)]];
  3110. }
  3111. }
  3112. }
  3113. /**
  3114. * Put a terminator in the 'len_'th element and then return a
  3115. * pointer to the buffer. Useful for printing.
  3116. */
  3117. virtual const char* toZBuf() const { return this->toZBufXForm(iupacs); }
  3118. };
  // Shorthand names for specific instantiations of the string templates.
  // BTString: SStringExpandable of char with S=1024, M=2.
  typedef SStringExpandable<char, 1024, 2> BTString;
  // BTDnaString: SDnaStringExpandable with S=1024, M=2.
  typedef SDnaStringExpandable<1024, 2> BTDnaString;
  // BTDnaMask: SDnaMaskString with S=32, M=2.
  typedef SDnaMaskString<32, 2> BTDnaMask;
  3122. #endif /* SSTRING_H_ */