PageRenderTime 77ms CodeModel.GetById 31ms RepoModel.GetById 0ms app.codeStats 0ms

/fctypes.cpp

https://github.com/gigablast/open-source-search-engine
C++ | 2621 lines | 2290 code | 85 blank | 246 comment | 214 complexity | b041340853aa66ac00698eb8c56868e9 MD5 | raw file
Possible License(s): Apache-2.0
  1. #include "gb-include.h"
  2. #include "Loop.h"
  3. #include "Entities.h"
  4. #include "UCWordIterator.h"
  5. #include "SafeBuf.h"
  6. #include "Xml.h"
  7. #include "XmlNode.h"
  8. #include "iana_charset.h"
  // file-local flag; isClockInSync() returns it for every host except host #0
  static bool g_clockInSync     = false;
  // externally visible flag; starts out true (not read in the visible part
  // of this file)
  bool        g_clockNeedsUpdate = true;
  11. bool isClockInSync() {
  12. if ( g_hostdb.m_initialized && g_hostdb.m_hostId == 0 ) return true;
  13. return g_clockInSync;
  14. }
  15. bool print96 ( char *k ) {
  16. key_t *kp = (key_t *)k;
  17. printf("n1=0x%"XINT32" n0=0x%"XINT64"\n",(int32_t)kp->n1,(int64_t)kp->n0);
  18. return true;
  19. }
  20. bool print96 ( key_t *kp ) {
  21. printf("n1=0x%"XINT32" n0=0x%"XINT64"\n",(int32_t)kp->n1,(int64_t)kp->n0);
  22. return true;
  23. }
  24. bool print128 ( char *k ) {
  25. key128_t *kp = (key128_t *)k;
  26. printf("n1=0x%"XINT64" n0=0x%"XINT64"\n",(int64_t)kp->n1,(int64_t)kp->n0);
  27. return true;
  28. }
  29. bool print128 ( key128_t *kp ) {
  30. printf("n1=0x%"XINT64" n0=0x%"XINT64"\n",(int64_t)kp->n1,(int64_t)kp->n0);
  31. return true;
  32. }
  33. // . put all the maps here now
  34. // . convert "c" to lower case
  // . 256-entry byte -> lower-case byte map
  // . 'A'-'Z' map to 'a'-'z'; 192-222 map to 224-254
  // . 215 (the multiplication sign in ISO-8859-1) maps to ITSELF: it is not a
  //   letter (g_map_is_upper has it off), so it must not turn into 247
  // . every other byte, including 223, maps to itself
  const unsigned char g_map_to_lower[] = {
      0,1,2,3,4,5,6,7,  8,9,10,11,12,13,14,15,                            // 0
      16,17,18,19,20,21,22,23,  24,25,26,27,28,29,30,31,                  // 16
      32,33,34,35,36,37,38,39,  40,41,42,43,44,45,46,47,                  // 32
      48,49,50,51,52,53,54,55,  56,57,58,59,60,61,62,63,                  // 48
      64,'a','b','c','d','e','f','g',  'h','i','j','k','l','m','n','o',   // 64
      'p','q','r','s','t','u','v','w',  'x','y','z',91,92,93,94,95,       // 80
      96,'a','b','c','d','e','f','g',  'h','i','j','k','l','m','n','o',   // 96
      'p','q','r','s','t','u','v','w',  'x','y','z',123,124,125,126,127,  // 112
      128,129,130,131,132,133,134,135,  136,137,138,139,140,141,142,143,  // 128
      144,145,146,147,148,149,150,151,  152,153,154,155,156,157,158,159,  // 144
      160,161,162,163,164,165,166,167,  168,169,170,171,172,173,174,175,  // 160
      176,177,178,179,180,181,182,183,  184,185,186,187,188,189,190,191,  // 176
      224,225,226,227,228,229,230,231,  232,233,234,235,236,237,238,239,  // 192
      240,241,242,243,244,245,246,215,  248,249,250,251,252,253,254,223,  // 208
      224,225,226,227,228,229,230,231,  232,233,234,235,236,237,238,239,  // 224
      240,241,242,243,244,245,246,247,  248,249,250,251,252,253,254,255   // 240
  };
  // . 256-entry byte -> UPPER-case byte map
  // . 'a'-'z' map to 'A'-'Z'; 224-254 map to 192-222
  // . 247 (the division sign in ISO-8859-1) maps to ITSELF: it is not a
  //   letter (g_map_is_lower has it off), so it must not turn into 215
  // . every other byte, including 255, maps to itself
  const unsigned char g_map_to_upper[] = {
      0,1,2,3,4,5,6,7,  8,9,10,11,12,13,14,15,                            // 0
      16,17,18,19,20,21,22,23,  24,25,26,27,28,29,30,31,                  // 16
      32,33,34,35,36,37,38,39,  40,41,42,43,44,45,46,47,                  // 32
      48,49,50,51,52,53,54,55,  56,57,58,59,60,61,62,63,                  // 48
      64,'A','B','C','D','E','F','G',  'H','I','J','K','L','M','N','O',   // 64
      'P','Q','R','S','T','U','V','W',  'X','Y','Z',91,92,93,94,95,       // 80
      96,'A','B','C','D','E','F','G',  'H','I','J','K','L','M','N','O',   // 96
      'P','Q','R','S','T','U','V','W',  'X','Y','Z',123,124,125,126,127,  // 112
      128,129,130,131,132,133,134,135,  136,137,138,139,140,141,142,143,  // 128
      144,145,146,147,148,149,150,151,  152,153,154,155,156,157,158,159,  // 144
      160,161,162,163,164,165,166,167,  168,169,170,171,172,173,174,175,  // 160
      176,177,178,179,180,181,182,183,  184,185,186,187,188,189,190,191,  // 176
      192,193,194,195,196,197,198,199,  200,201,202,203,204,205,206,207,  // 192
      208,209,210,211,212,213,214,215,  216,217,218,219,220,221,222,223,  // 208
      192,193,194,195,196,197,198,199,  200,201,202,203,204,205,206,207,  // 224
      208,209,210,211,212,213,214,247,  216,217,218,219,220,221,222,255   // 240
  };
  // . 256-entry byte -> "closest ascii" byte map
  // . 0-159 map to themselves
  // . some symbols in 160-191 get an ascii look-alike, the rest stay put
  // . letters in 192-255 are replaced by an unaccented ascii letter, except
  //   222 (TH) and 254 (th) which are kept; 198 (AE) -> 'A', 230 (ae) -> 'a',
  //   223 -> 's' (changed from B)
  const unsigned char g_map_to_ascii[] = {
      0,1,2,3,4,5,6,7,  8,9,10,11,12,13,14,15,                            // 0
      16,17,18,19,20,21,22,23,  24,25,26,27,28,29,30,31,                  // 16
      32,33,34,35,36,37,38,39,  40,41,42,43,44,45,46,47,                  // 32
      48,49,50,51,52,53,54,55,  56,57,58,59,60,61,62,63,                  // 48
      64,'A','B','C','D','E','F','G',  'H','I','J','K','L','M','N','O',   // 64
      'P','Q','R','S','T','U','V','W',  'X','Y','Z',91,92,93,94,95,       // 80
      96,'a','b','c','d','e','f','g',  'h','i','j','k','l','m','n','o',   // 96
      'p','q','r','s','t','u','v','w',  'x','y','z',123,124,125,126,127,  // 112
      128,129,130,131,132,133,134,135,  136,137,138,139,140,141,142,143,  // 128
      144,145,146,147,148,149,150,151,  152,153,154,155,156,157,158,159,  // 144
      160,161,162,'#','o','Y','|','S',  168,169,'a',171,172,173,174,175,  // 160
      176,177,'2','3',180,'u',182,183,  ' ','1','o',187,188,189,190,'?',  // 176
      'A','A','A','A','A','A','A','C',  'E','E','E','E','I','I','I','I',  // 192
      'D','N','O','O','O','O','O','x',  'O','U','U','U','U','Y',222,'s',  // 208
      'a','a','a','a','a','a','a','c',  'e','e','e','e','i','i','i','i',  // 224
      'd','n','o','o','o','o','o','/',  'o','u','u','u','u','y',254,'y'   // 240
  };
  // . 256-entry flag table: 1 for 'A'-'Z' and for 192-223 except 215
  // . NOTE(review): 223 is flagged here although g_map_to_lower maps it to
  //   itself -- confirm that is intended
  const char g_map_is_upper[] = {
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 0
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 32
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 48
      0,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 64  (A=65)
      1,1,1,1,1,1,1,1,  1,1,1,0,0,0,0,0,  // 80  (Z=90)
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 96
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 112
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 192
      1,1,1,1,1,1,1,0,  1,1,1,1,1,1,1,1,  // 208 (215 is off)
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0   // 240
  };
  171. // can this character be in an html (or xml) tag name??
  // . 256-entry flag table: 1 for '-' (45), 'A'-'Z' and 'a'-'z'
  // . digits and every byte >= 128 are off
  const char g_map_canBeInTagName[] = {
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 0
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      0,0,0,0,0,0,0,0,  0,0,0,0,0,1,0,0,  // 32  (hyphen=45)
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 48
      0,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 64
      1,1,1,1,1,1,1,1,  1,1,1,0,0,0,0,0,  // 80
      0,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 96
      1,1,1,1,1,1,1,1,  1,1,1,0,0,0,0,0,  // 112
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 208
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0   // 240
  };
  // . 256-entry flag table: 1 for 0-31, 127 (DEL), 128-159 and 160
  // . the original note labels 160 "backspace"; in ISO-8859-1 byte 160 is
  //   the no-break space
  const char g_map_is_control [] = {
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 0
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 16
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 32
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 48
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 64
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 80
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 96
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,1,  // 112 (127 = DEL)
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 128
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 144
      1,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 208
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0   // 240
  };
  238. // people mix windows 1252 into latin-1 so we have to be less restrictive here...
  // . 256-entry flag table: 1 marks a byte counted as "binary"
  // . 0-31 are on except \t (9), \n (10) and \r (13); 127 (DEL) is on
  // . within 128-159 these are left off (original author's notes): 128 is a
  //   quote, 145-147 quotes, 148 dash, 149 bullet, 150 dash, 152 & 153
  //   quotes, 156 & 157 double quotes
  // . 160-255 are all off (160: "some urls have this???")
  const char g_map_is_binary[] = {
      1,1,1,1,1,1,1,1,  1,0,0,1,1,0,1,1,  // 0   (\t=9 \n=10 \r=13)
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 16
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 32
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 48
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 64
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 80
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 96
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,1,  // 112 (127 = DEL)
      0,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 128
      1,0,0,0,0,0,0,1,  0,0,1,1,0,0,1,1,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 208
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0   // 240
  };
  272. // ' ' '\n' '\t' '\r'
  // . 256-entry flag table: 1 for \t (9), \n (10), \r (13) and space (32)
  // . 160 is deliberately off: it might be a utf8 byte
  const char g_map_is_wspace[] = {
      0,0,0,0,0,0,0,0,  0,1,1,0,0,1,0,0,  // 0   (\t=9 \n=10 \r=13)
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      1,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 32  (space=32)
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 48
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 64
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 80
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 96
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 112
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 208
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0   // 240
  };
  306. // '\n'
  // . 256-entry flag table: 1 only for '\n' (10)
  // . rows 192-223 used to carry the 1s of g_map_is_upper (a copy/paste
  //   leftover) which made 31 high bytes count as vertical space; they are
  //   zero now
  const char g_map_is_vspace[] = {
      0,0,0,0,0,0,0,0,  0,0,1,0,0,0,0,0,  // 0   (\n=10)
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 32
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 48
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 64
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 80
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 96
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 112
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 208
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0   // 240
  };
  340. // ' ' '\t'
  // . 256-entry flag table: 1 only for '\t' (9) and ' ' (32)
  // . rows 192-223 used to carry the 1s of g_map_is_upper (a copy/paste
  //   leftover) which made 31 high bytes count as horizontal space; they are
  //   zero now
  const char g_map_is_hspace[] = {
      0,0,0,0,0,0,0,0,  0,1,0,0,0,0,0,0,  // 0   (\t=9)
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      1,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 32  (space=32)
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 48
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 64
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 80
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 96
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 112
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 208
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0   // 240
  };
  // . 256-entry flag table: 1 for A E I O U and a e i o u, 0 elsewhere
  const char g_map_is_vowel[] = {
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 0
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 32
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 48
      0,1,0,0,0,1,0,0,  0,1,0,0,0,0,0,1,  // 64  (A=65 E=69 I=73 O=79)
      0,0,0,0,0,1,0,0,  0,0,0,0,0,0,0,0,  // 80  (U=85)
      0,1,0,0,0,1,0,0,  0,1,0,0,0,0,0,1,  // 96  (a=97 e=101 i=105 o=111)
      0,0,0,0,0,1,0,0,  0,0,0,0,0,0,0,0,  // 112 (u=117)
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 208
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0   // 240
  };
  // . 256-entry flag table (a predicate, not a conversion)
  // . 1 for 97-122 ('a'-'z') and 224-255 excluding 247
  const char g_map_is_lower[] = {
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 0
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 32
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 48
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 64
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 80
      0,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 96  (a=97)
      1,1,1,1,1,1,1,1,  1,1,1,0,0,0,0,0,  // 112 (z=122)
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 208
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 224
      1,1,1,1,1,1,1,0,  1,1,1,1,1,1,1,1   // 240 (247 is off)
  };
  // . 256-entry flag table: 1 for 32 to 126, 0 elsewhere
  const char g_map_is_ascii[] = {
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 0
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 32
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 48
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 64
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 80
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 96
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,0,  // 112 (127 is off)
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 208
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0   // 240
  };
  474. // just from 0-127, used by the inlined *_utf8() functions in fctypes.h
  // . 256-entry flag table: 1 for every byte 0 to 127, 0 for 128 to 255
  const char g_map_is_ascii3[] = {
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 0
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 16
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 32
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 48
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 64
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 80
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 96
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 112
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 208
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0   // 240
  };
  // . 256-entry flag table: 1 for 161 to 255, 0 for 0 to 160
  const char g_map_is_iso[] = {
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 0
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 32
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 48
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 64
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 80
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 96
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 112
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 160 (160 itself is off)
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 176
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 192
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 208
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 224
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1   // 240
  };
  // . 256-entry flag table
  // . 1 for 33-47, 58-64, 91-96, 123-126, 161-191, 215 and 247
  const char g_map_is_punct[] = {
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 0
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      0,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 32  (space is off)
      0,0,0,0,0,0,0,0,  0,0,1,1,1,1,1,1,  // 48  (digits are off)
      1,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 64
      0,0,0,0,0,0,0,0,  0,0,0,1,1,1,1,1,  // 80
      1,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 96
      0,0,0,0,0,0,0,0,  0,0,0,1,1,1,1,0,  // 112 (127 is off)
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 160 (160 is off)
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,1,  0,0,0,0,0,0,0,0,  // 208 (215 is on)
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,1,  0,0,0,0,0,0,0,0   // 240 (247 is on)
  };
  // . 256-entry flag table
  // . 1 for 48-57, 65-90, 97-122 and 192-255 (excluding 215 and 247)
  const char g_map_is_alnum[] = {
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 0
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 32
      1,1,1,1,1,1,1,1,  1,1,0,0,0,0,0,0,  // 48
      0,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 64
      1,1,1,1,1,1,1,1,  1,1,1,0,0,0,0,0,  // 80
      0,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 96
      1,1,1,1,1,1,1,1,  1,1,1,0,0,0,0,0,  // 112
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 192
      1,1,1,1,1,1,1,0,  1,1,1,1,1,1,1,1,  // 208 (215 is off)
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 224
      1,1,1,1,1,1,1,0,  1,1,1,1,1,1,1,1   // 240 (247 is off)
  };
  // . 256-entry flag table
  // . 1 for 65-90, 97-122 and 192-255 (excluding 215 and 247)
  const char g_map_is_alpha[] = {
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 0
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 32
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 48
      0,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 64
      1,1,1,1,1,1,1,1,  1,1,1,0,0,0,0,0,  // 80
      0,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 96
      1,1,1,1,1,1,1,1,  1,1,1,0,0,0,0,0,  // 112
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 192
      1,1,1,1,1,1,1,0,  1,1,1,1,1,1,1,1,  // 208 (215 is off)
      1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 224
      1,1,1,1,1,1,1,0,  1,1,1,1,1,1,1,1   // 240 (247 is off)
  };
  // . 256-entry flag table: 1 for 48-57 ('0'-'9'), 0 elsewhere
  const char g_map_is_digit[] = {
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 0
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 32
      1,1,1,1,1,1,1,1,  1,1,0,0,0,0,0,0,  // 48
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 64
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 80
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 96
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 112
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 208
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0   // 240
  };
  // . 256-entry flag table
  // . 1 for 48-57 ('0'-'9'), 65-70 ('A'-'F') and 97-102 ('a'-'f')
  const char g_map_is_hex[] = {
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 0
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 32
      1,1,1,1,1,1,1,1,  1,1,0,0,0,0,0,0,  // 48
      0,1,1,1,1,1,1,0,  0,0,0,0,0,0,0,0,  // 64  (65='A')
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 80
      0,1,1,1,1,1,1,0,  0,0,0,0,0,0,0,0,  // 96  (97='a')
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 112
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 208
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0   // 240
  };
  706. // stolen from is_alnum, but turned on - and _
  // . 256-entry flag table
  // . 1 for '-' (45), 48-57, ':' (58), 65-90, '_' (95) and 97-122
  // . ':' is included for feedburner:origlink
  // . 128-255 are all off: we are no longer necessarily latin-1!!
  const char g_map_is_tagname_char [] = {
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 0
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      0,0,0,0,0,0,0,0,  0,0,0,0,0,1,0,0,  // 32  (45 = -)
      1,1,1,1,1,1,1,1,  1,1,1,0,0,0,0,0,  // 48  (58 = :)
      0,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 64
      1,1,1,1,1,1,1,1,  1,1,1,0,0,0,0,1,  // 80  (95 = _)
      0,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,  // 96
      1,1,1,1,1,1,1,1,  1,1,1,0,0,0,0,0,  // 112
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 208
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0   // 240
  };
  // . 256-entry flag table: 1 for " (34), ' (39), < (60) and > (62)
  const char g_map_is_tag_control_char[] = {
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 0
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 16
      0,0,1,0,0,0,0,1,  0,0,0,0,0,0,0,0,  // 32  (" and ')
      0,0,0,0,0,0,0,0,  0,0,0,0,1,0,1,0,  // 48  (< and >)
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 64
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 80
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 96
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 112
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 128
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 144
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 160
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 176
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 192
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 208
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 224
      0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0   // 240
  };
  774. // when matching query terms to words/phrases in doc skip over spaces
  775. // or other punct so that "flypaper" in the query matches "fly paper" in the
  776. // doc
  777. /*
  778. const char g_map_is_match_skip[] = { // 48-57
  779. 0,0,0,0,0,0,0,0, // 0
  780. 0,1,1,0,0,0,0,0, // \t and \n
  781. 0,0,0,0,0,0,0,0, // 16
  782. 0,0,0,0,0,0,0,0,
  783. 1,0,0,0,0,0,0,1, // 32 space and '
  784. 0,0,0,0,0,1,0,0, // 40 -
  785. 0,0,0,0,0,0,0,0, // 48
  786. 0,0,0,0,0,0,0,0, // 56
  787. 0,0,0,0,0,0,0,0, // 64
  788. 0,0,0,0,0,0,0,0,
  789. 0,0,0,0,0,0,0,0,
  790. 0,0,0,0,0,0,0,0,
  791. 0,0,0,0,0,0,0,0,
  792. 0,0,0,0,0,0,0,0,
  793. 0,0,0,0,0,0,0,0,
  794. 0,0,0,0,0,0,0,0,
  795. 0,0,0,0,0,0,0,0,
  796. 0,0,0,0,0,0,0,0,
  797. 0,0,0,0,0,0,0,0,
  798. 0,0,0,0,0,0,0,0,
  799. 0,0,0,0,0,0,0,0,
  800. 0,0,0,0,0,0,0,0,
  801. 0,0,0,0,0,0,0,0,
  802. 0,0,0,0,0,0,0,0,
  803. 0,0,0,0,0,0,0,0,
  804. 0,0,0,0,0,0,0,0,
  805. 0,0,0,0,0,0,0,0,
  806. 0,0,0,0,0,0,0,0,
  807. 0,0,0,0,0,0,0,0,
  808. 0,0,0,0,0,0,0,0,
  809. 0,0,0,0,0,0,0,0,
  810. 0,0,0,0,0,0,0,0};
  811. */
  // . bounded string length (seems like this should be defined, but it isn't)
  // . returns the offset of the first '\0' among the first "maxLen" bytes of
  //   "s", or "maxLen" when there is none
  // . returns 0 when "maxLen" is zero or negative
  int32_t strnlen ( const char *s , int32_t maxLen ) {
      int32_t n = 0;
      while ( n < maxLen && s[n] != '\0' ) n++;
      return n;
  }
  818. char *strncasestr( char *haystack, int32_t haylen, char *needle){
  819. int32_t matchLen = 0;
  820. int32_t needleLen = gbstrlen(needle);
  821. for (int32_t i = 0; i < haylen;i++){
  822. char c1 = to_lower_a(haystack[i]);
  823. char c2 = to_lower_a(needle[matchLen]);
  824. if ( c1 != c2 ){
  825. // no match
  826. matchLen = 0;
  827. continue;
  828. }
  829. // we matched another character
  830. matchLen++;
  831. if (matchLen < needleLen) continue;
  832. // we've matched the whole string
  833. return haystack + i - matchLen + 1;
  834. }
  835. return NULL;
  836. }
  // . case-sensitive search for the NUL-terminated "needle" inside the first
  //   "haylen" bytes of "haystack"
  // . "haystack" need not be NUL terminated
  // . returns a pointer to the first match, or NULL when there is none
  // . every offset is tried as a match start. the previous single-pass
  //   version skipped the byte that ended a partial match, so e.g. "ab" was
  //   not found in "aab"
  // . an empty needle never matches (it used to "match" the first NUL byte
  //   in the haystack, if any)
  char *strnstr2( char *haystack, int32_t haylen, char *needle){
      if ( ! needle[0] ) return NULL;
      for ( int32_t start = 0 ; start < haylen ; start++ ) {
          int32_t matched = 0;
          // extend the match while the needle and the haystack both last
          while ( needle[matched] &&
                  start + matched < haylen &&
                  haystack[start + matched] == needle[matched] )
              matched++;
          // reached the needle's terminator: whole needle matched
          if ( ! needle[matched] ) return haystack + start;
      }
      return NULL;
  }
  856. // . get the # of words in this string
  857. int32_t getNumWords ( char *s , int32_t len, int32_t titleVersion ) {
  858. int32_t wordCount = 0;
  859. bool inWord = false;
  860. for ( int32_t i = 0 ; i < len ; i++ ) {
  861. if ( ! is_alnum_a ( s[i] ) && s[i]!='\'' ) {
  862. inWord = false;
  863. continue;
  864. }
  865. if ( ! inWord ) {
  866. inWord = true;
  867. wordCount++;
  868. }
  869. }
  870. return wordCount;
  871. }
  // . this stores "n" in decimal into "s" and returns the # of bytes written
  //   into "s" (not counting the terminating NUL)
  // . it also puts a comma between every group of three digits
  // . it also NUL terminates the bytes written into "s"
  // . "s" must have room for 27 bytes (20 digits, 6 commas and the NUL) to
  //   hold the largest uint64_t
  // . digits are emitted directly instead of going through an unchecked
  //   sprintf() plus a length rescan per group; the output is unchanged
  int32_t ulltoa ( char *s , uint64_t n ) {
      // if n is zero, it's easy
      if ( n == 0 ) { s[0] = '0'; s[1] = '\0'; return 1; }
      // collect the digits, least significant first. 2^64-1 has 20 digits.
      char digits[20];
      int32_t numDigits = 0;
      while ( n != 0 ) {
          digits[numDigits++] = (char)('0' + (int)(n % 10));
          n /= 10;
      }
      // emit most significant first. "k" is how many digits are still to
      // come after this one, so a comma is due whenever it is a non-zero
      // multiple of three.
      char *out = s;
      for ( int32_t k = numDigits - 1 ; k >= 0 ; k-- ) {
          *out++ = digits[k];
          if ( k > 0 && k % 3 == 0 ) *out++ = ',';
      }
      // null terminate it
      *out = '\0';
      // return # of bytes stored into "s"
      return (int32_t)(out - s);
  }
  905. /*
  906. int32_t atol2 ( const char *s, int32_t len ) {
  907. char tmp[32];
  908. if ( len > 30 ) len = 30;
  909. gbmemcpy ( tmp , s , len );
  910. tmp [ len ] = '\0';
  911. return atol ( s );
  912. }
  913. */
  914. int32_t atol2 ( const char *s, int32_t len ) {
  915. // skip over spaces
  916. const char *end = s + len;
  917. while ( s < end && is_wspace_a ( *s ) ) s++;
  918. // return 0 if all spaces
  919. if ( s == end ) return 0;
  920. int32_t i = 0;
  921. int32_t val = 0;
  922. bool negative = false;
  923. if ( s[0] == '-' ) { negative = true; i++; }
  924. while ( i < len && is_digit(s[i]) ) val = val * 10 + ( s[i++] - '0' );
  925. if ( negative ) return -val;
  926. return val;
  927. }
  // . thin wrapper: hands the NUL-terminated "s" to the C library's atoll()
  //   and returns its result as an int64_t
  int64_t atoll1 ( const char *s ) {
      const int64_t parsed = atoll ( s );
      return parsed;
  }
  931. int64_t atoll2 ( const char *s, int32_t len ) {
  932. // skip over spaces
  933. const char *end = s + len;
  934. while ( s < end && is_wspace_a ( *s ) ) s++;
  935. // return 0 if all spaces
  936. if ( s == end ) return 0;
  937. int32_t i = 0;
  938. int64_t val = 0LL;
  939. bool negative = false;
  940. if ( s[0] == '-' ) { negative = true; i++; }
  941. while ( i < len && is_digit(s[i]) ) val = val * 10LL + ( s[i++] - '0');
  942. if ( negative ) return -val;
  943. return val;
  944. }
  945. double atof2 ( const char *s, int32_t len ) {
  946. // skip over spaces
  947. const char *end = s + len;
  948. while ( s < end && is_wspace_a ( *s ) ) { s++; len--; }
  949. // return 0 if all spaces
  950. if ( s == end ) return 0;
  951. char tmpBuf[128];
  952. if ( len >= 128 ) len = 127;
  953. //strncpy ( dst , s , len );
  954. const char *p = s;
  955. const char *srcEnd = s + len;
  956. char *dst = tmpBuf;
  957. // remove commas
  958. for ( ; p < srcEnd ; p++ ) {
  959. // skip commas
  960. if ( *p == ',' ) continue;
  961. // otherwise store it
  962. *dst++ = *p;
  963. }
  964. // null term
  965. *dst = '\0';
  966. //buf[len] = '\0';
  967. return atof ( tmpBuf );
  968. }
  // . parse a double from the first "len" bytes of "s" with strtod()
  // . the byte at s[len] is overwritten with a NUL for the duration of the
  //   call and put back afterwards, so s[len] must be readable AND writable
  double atod2 ( char *s, int32_t len ) {
      // remember the byte we are about to clobber
      const char saved = s[len];
      // null term temp
      s[len] = '\0';
      const double value = strtod ( s , NULL );
      // undo it
      s[len] = saved;
      return value;
  }
  981. bool atob ( const char *s, int32_t len ) {
  982. // skip over spaces
  983. const char *end = s + len;
  984. while ( s < end && is_wspace_a ( *s ) ) s++;
  985. // return false if all spaces
  986. if ( s == end ) return false;
  987. // parse the ascii bool value
  988. if ( s[0] == 't' || s[0] == 'T' ) return true;
  989. if ( s[0] == 'y' || s[0] == 'Y' ) return true;
  990. if ( ! is_digit ( *s ) || *s == '0' ) return false;
  991. return true;
  992. }
  993. // hexadecimal ascii to key_t
  994. int64_t htoint32_tint32_t ( const char *s, int32_t len ) {
  995. // skip over spaces
  996. const char *end = s + len;
  997. while ( s < end && is_wspace_a ( *s ) ) s++;
  998. // return 0 if all spaces
  999. if ( s == end ) return 0;
  1000. int32_t i = 0;
  1001. int64_t val = 0;
  1002. while ( i < len && is_hex(s[i]) )
  1003. val = val * 16 + htob ( s[i++] );
  1004. return val;
  1005. }
  1006. // convert hex ascii string into binary at "dst"
  1007. void hexToBin ( char *src , int32_t srcLen , char *dst ) {
  1008. char *srcEnd = src + srcLen;
  1009. for ( ; src && src < srcEnd ; ) {
  1010. *dst = htob(*src++);
  1011. *dst <<= 4;
  1012. *dst |= htob(*src++);
  1013. dst++;
  1014. }
  1015. // sanity check
  1016. if ( src != srcEnd ) { char *xx=NULL;*xx=0; }
  1017. }
  1018. void binToHex ( unsigned char *src , int32_t srcLen , char *dst ) {
  1019. unsigned char *srcEnd = src + srcLen;
  1020. for ( ; src && src < srcEnd ; ) {
  1021. *dst++ = btoh(*src>>4);
  1022. *dst++ = btoh(*src&15);
  1023. src++;
  1024. }
  1025. // always null term!
  1026. *dst = '\0';
  1027. // sanity check
  1028. if ( src != srcEnd ) { char *xx=NULL;*xx=0; }
  1029. }
  1030. // . like strstr but haystack may not be NULL terminated
  1031. // . needle, however, IS null terminated
  1032. char *strncasestr ( char *haystack , char *needle , int32_t haystackSize ) {
  1033. int32_t needleSize = gbstrlen(needle);
  1034. int32_t n = haystackSize - needleSize ;
  1035. for ( int32_t i = 0 ; i <= n ; i++ ) {
  1036. // keep looping if first chars do not match
  1037. if ( to_lower_a(haystack[i]) != to_lower_a(needle[0]) )
  1038. continue;
  1039. // if needle was only 1 char it's a match
  1040. if ( ! needle[1] ) return &haystack[i];
  1041. // compare the whole strings now
  1042. if ( strncasecmp ( &haystack[i] , needle , needleSize ) == 0 )
  1043. return &haystack[i];
  1044. }
  1045. return NULL;
  1046. }
  1047. // . like strstr but haystack may not be NULL terminated
  1048. // . needle, however, IS null terminated
  1049. char *strncasestr ( char *haystack , char *needle ,
  1050. int32_t haystackSize, int32_t needleSize ) {
  1051. int32_t n = haystackSize - needleSize ;
  1052. for ( int32_t i = 0 ; i <= n ; i++ ) {
  1053. // keep looping if first chars do not match
  1054. if ( to_lower_a(haystack[i]) != to_lower_a(needle[0]) )
  1055. continue;
  1056. // if needle was only 1 char it's a match
  1057. if ( ! needle[1] ) return &haystack[i];
  1058. // compare the whole strings now
  1059. if ( strncasecmp ( &haystack[i] , needle , needleSize ) == 0 )
  1060. return &haystack[i];
  1061. }
  1062. return NULL;
  1063. }
  1064. char *strnstr ( char *haystack , char *needle , int32_t haystackSize ) {
  1065. int32_t needleSize = gbstrlen(needle);
  1066. int32_t n = haystackSize - needleSize ;
  1067. for ( int32_t i = 0 ; i <= n ; i++ ) {
  1068. // keep looping if first chars do not match
  1069. if ( haystack[i] != needle[0] ) continue;
  1070. // if needle was only 1 char it's a match
  1071. if ( ! needle[1] ) return &haystack[i];
  1072. // compare the whole strings now
  1073. if ( strncmp ( &haystack[i] , needle , needleSize ) == 0 )
  1074. return &haystack[i];
  1075. }
  1076. return NULL;
  1077. }
  1078. // independent of case
  1079. char *gb_strcasestr ( char *haystack , char *needle ) {
  1080. int32_t needleSize = gbstrlen(needle);
  1081. int32_t haystackSize = gbstrlen(haystack);
  1082. int32_t n = haystackSize - needleSize ;
  1083. for ( int32_t i = 0 ; i <= n ; i++ ) {
  1084. // keep looping if first chars do not match
  1085. if ( to_lower_a(haystack[i]) != to_lower_a(needle[0]) )
  1086. continue;
  1087. // if needle was only 1 char it's a match
  1088. if ( ! needle[1] ) return &haystack[i];
  1089. // compare the whole strings now
  1090. if ( strncasecmp ( &haystack[i] , needle , needleSize ) == 0 )
  1091. return &haystack[i];
  1092. }
  1093. return NULL;
  1094. }
  1095. char *gb_strncasestr ( char *haystack , int32_t haystackSize , char *needle ) {
  1096. // temp term
  1097. char c = haystack[haystackSize];
  1098. haystack[haystackSize] = '\0';
  1099. char *res = gb_strcasestr ( haystack , needle );
  1100. haystack[haystackSize] = c;
  1101. return res;
  1102. }
  // . copy "t" (tlen bytes) into "s", turning '<' into "&lt;" and '>' into
  //   "&gt;"
  // . "slen" is the size of the "s" buffer; copying stops early once fewer
  //   than 5 bytes remain before the slot reserved for the \0
  // . returns bytes stored into "s", excluding the \0
  // . NULL terminates "s" if slen > 0; returns 0 untouched if slen <= 0
  int32_t saftenTags ( char *s , int32_t slen , char *t , int32_t tlen ) {
      // bail if slen is 0
      if ( slen <= 0 ) return 0;
      char *out = s;
      // leave a char for the \0
      char *outEnd = s + slen - 1;
      char *inEnd  = t + tlen;
      while ( t < inEnd && out + 4 < outEnd ) {
          const char c = *t++;
          switch ( c ) {
          case '<':
              out[0] = '&'; out[1] = 'l'; out[2] = 't'; out[3] = ';';
              out += 4;
              break;
          case '>':
              out[0] = '&'; out[1] = 'g'; out[2] = 't'; out[3] = ';';
              out += 4;
              break;
          default:
              *out++ = c;
              break;
          }
      }
      // NULL terminate "s"
      *out = '\0';
      // return # of bytes, excluding \0, stored into s
      return out - s;
  }
  1136. // . if "doSpecial" is true, then we change &lt;, &gt; and &amp; to
  1137. // the following:
  1138. // UnicodeData.txt:22E6;LESS-THAN BUT NOT EQUIVALENT TO;Sm;0;ON;;;;;Y;
  1139. // UnicodeData.txt:22E7;GREATER-THAN BUT NOT EQUIVALENT TO;Sm;0;ON;;;;;Y;
  1140. // UnicodeData.txt:E0026;TAG AMPERSAND;Cf;0;BN;;;;;N;;;;;
  1141. // UnicodeData.txt:235E;APL FUNCTIONAL SYMBOL QUOTE QUAD;So;0;L;;;;;N;;;;;
  1142. int32_t htmlDecode ( char *dst , char *src , int32_t srcLen , bool doSpecial ,
  1143. int32_t niceness ) {
  1144. if ( srcLen == 0 ) return 0;
  1145. char *start = dst;
  1146. char *srcEnd = src + srcLen;
  1147. for ( ; src < srcEnd ; ) {
  1148. // breathe
  1149. QUICKPOLL(niceness);
  1150. // utf8 support?
  1151. char size = getUtf8CharSize(src);
  1152. // all entities must start with '&'
  1153. if ( *src != '&' ) {
  1154. if ( size == 1 ) { *dst++ = *src++; continue; }
  1155. gbmemcpy ( dst , src , size );
  1156. src += size;
  1157. dst += size;
  1158. continue;
  1159. //*dst++ = *src++; continue; }
  1160. }
  1161. // TODO: avoid doSpecial by not decoding crap in tags...
  1162. //if ( src[0] == '<' ) {
  1163. // // skip to tag end then!
  1164. //
  1165. // store decoded entity char into dst[j]
  1166. uint32_t c;
  1167. // "skip" is how many bytes the entities was in "src"
  1168. int32_t skip = getEntity_a (src, srcEnd-src, &c );
  1169. // ignore the "entity" if it was invalid
  1170. if ( skip == 0 ) { *dst++ = *src++ ; continue; }
  1171. // force this now always since some tags contain &quot;
  1172. // and it was causing the tags to be terminated too early
  1173. // for richmondspca.org
  1174. //if ( c == '\"' ) c = '\'';
  1175. //if ( c == '<' ) c = '[';
  1176. //if ( c == '>' ) c = ']';
  1177. // . special mapping
  1178. // . make &lt; and &gt; special so Xml::set() still works
  1179. // . and make &amp; special so we do not screw up summaries
  1180. if ( doSpecial ) {
  1181. // no longer use this!
  1182. //char *xx=NULL;*xx=0;
  1183. if ( c == '<' ) {
  1184. // using [ and ] looks bad in event titles...
  1185. *dst = '|';
  1186. dst++;
  1187. src += skip;
  1188. continue;
  1189. gbmemcpy(dst,"+!-",3);
  1190. //gbmemcpy(dst,"<gb",3);
  1191. dst += 3;
  1192. src += skip;
  1193. continue;
  1194. // paragraph sign:
  1195. //c = 0xc2b6;
  1196. }
  1197. if ( c == '>' ) {
  1198. // using [ and ] looks bad in event titles...
  1199. *dst = '|';
  1200. dst++;
  1201. src += skip;
  1202. continue;
  1203. //gbmemcpy(dst,"gb>",3);
  1204. gbmemcpy(dst,"-!+",3);
  1205. dst += 3;
  1206. src += skip;
  1207. continue;
  1208. // high-rise hyphen:
  1209. //c = 0xc2af;
  1210. }
  1211. // some tags have &quot; in their value strings
  1212. // so we have to preserve that!
  1213. // use curling quote:
  1214. //http://www.dwheeler.com/essays/quotes-test-utf-8.html
  1215. // curling double and single quotes resp:
  1216. // &ldquo; &rdquo; &lsquo; &rdquo;
  1217. if ( c == '\"' ) {
  1218. //c = 0x201c; // 0x235e;
  1219. *dst = '\'';
  1220. dst++;
  1221. src += skip;
  1222. continue;
  1223. }
  1224. //if ( c == '<' ) c = 0x22d6; // e6;
  1225. //if ( c == '>' ) c = 0x22d7; // e7;
  1226. // this was working ok, but just code it to an
  1227. // ampersand. when displaying a page we can code all
  1228. // ampersands back into &amp; i guess! that way
  1229. // the check for a " & " in the place name in
  1230. // Address.cpp works out...
  1231. //if ( c == '&' ) c = 0xff06; // full width ampersand
  1232. }
  1233. // . otherwise it was a legit entity
  1234. // . store it into "dst" in utf8 format
  1235. // . "numBytes" is how many bytes it stored into 'dst"
  1236. int32_t numBytes = utf8Encode ( c , dst );
  1237. // sanity check. do not eat our tail if dst == src
  1238. if ( numBytes > skip ) { char *xx=NULL;*xx=0; }
  1239. // advance dst ptr
  1240. dst += numBytes;
  1241. // skip over the encoded entity in the source string
  1242. src += skip;
  1243. }
  1244. // NULL term
  1245. *dst = '\0';
  1246. return dst - start;
  1247. }
  1248. // cdata
  1249. int32_t cdataDecode ( char *dst , char *src , int32_t niceness ) {
  1250. if ( ! src ) return 0;
  1251. char *start = dst;
  1252. for ( ; *src ; ) {
  1253. // breathe
  1254. QUICKPOLL(niceness);
  1255. // utf8 support?
  1256. char size = getUtf8CharSize(src);
  1257. // see SafeBuf::cdataEncode() we do the opposite here
  1258. if ( src[0] != ']' ||
  1259. src[1] != ']' ||
  1260. src[2] != '&' ||
  1261. src[3] != 'g' ||
  1262. src[4] != 't' ) {
  1263. if ( size == 1 ) { *dst++ = *src++; continue; }
  1264. gbmemcpy ( dst , src , size );
  1265. src += size;
  1266. dst += size;
  1267. continue;
  1268. //*dst++ = *src++; continue; }
  1269. }
  1270. // make it ]]>
  1271. gbmemcpy ( dst , "]]>" , 3 );
  1272. src += 5;
  1273. dst += 3;
  1274. }
  1275. // NULL term
  1276. *dst = '\0';
  1277. return dst - start;
  1278. }
  // . make something safe as a form input value by translating the quotes
  // . copies "t" (tlen bytes) into "s", writing each '"' as "&#34;"
  // . "send" is the end of the "s" buffer
  // . all or nothing: returns 0 if the output does not fit, otherwise NULL
  //   terminates "s" and returns bytes stored, excluding the \0
  int32_t dequote ( char *s , char *send , char *t , int32_t tlen ) {
      char *out = s;
      for ( char *in = t , *inEnd = t + tlen ; in < inEnd && out < send ; in++ ) {
          // ordinary chars pass straight through
          if ( *in != '"' ) { *out++ = *in; continue; }
          // need room for the 5-byte entity
          if ( out + 5 >= send ) return 0;
          out[0] = '&';
          out[1] = '#';
          out[2] = '3';
          out[3] = '4';
          out[4] = ';';
          out += 5;
      }
      // all or nothing
      if ( out + 1 >= send ) return 0;
      *out = '\0';
      return out - s;
  }
  1302. bool dequote ( SafeBuf* sb , char *t , int32_t tlen ) {
  1303. char *tend = t + tlen;
  1304. for ( ; t < tend; t++ ) {
  1305. if ( *t == '"' ) {
  1306. sb->safeMemcpy("&#34;", 5);
  1307. continue;
  1308. }
  1309. *sb += *t;
  1310. }
  1311. *sb += '\0';
  1312. return true;
  1313. }
  1314. //int32_t dequote ( char *s , char *t ) {
  1315. // return dequote ( s , t , gbstrlen ( t ) );
  1316. //}
  1317. // . entity-ize a string so it's safe for html output
  1318. // . store "t" into "s" and return bytes stored
  1319. // . does bounds checking
  1320. char *htmlEncode ( char *s , char *send , char *t , char *tend , bool pound ,
  1321. int32_t niceness ) {
  1322. for ( ; t < tend ; t++ ) {
  1323. QUICKPOLL(niceness);
  1324. if ( s + 7 >= send ) { *s = '\0'; return s; }
  1325. if ( *t == '"' ) {
  1326. *s++ = '&';
  1327. *s++ = '#';
  1328. *s++ = '3';
  1329. *s++ = '4';
  1330. *s++ = ';';
  1331. continue;
  1332. }
  1333. if ( *t == '<' ) {
  1334. *s++ = '&';
  1335. *s++ = 'l';
  1336. *s++ = 't';
  1337. *s++ = ';';
  1338. continue;
  1339. }
  1340. if ( *t == '>' ) {
  1341. *s++ = '&';
  1342. *s++ = 'g';
  1343. *s++ = 't';
  1344. *s++ = ';';
  1345. continue;
  1346. }
  1347. if ( *t == '&' ) {
  1348. *s++ = '&';
  1349. *s++ = 'a';
  1350. *s++ = 'm';
  1351. *s++ = 'p';
  1352. *s++ = ';';
  1353. continue;
  1354. }
  1355. if ( *t == '#' && pound ) {
  1356. *s++ = '&';
  1357. *s++ = '#';
  1358. *s++ = '0';
  1359. *s++ = '3';
  1360. *s++ = '5';
  1361. *s++ = ';';
  1362. continue;
  1363. }
  1364. *s++ = *t;
  1365. }
  1366. *s = '\0';
  1367. return s;
  1368. }
  1369. // . entity-ize a string so it's safe for html output
  1370. // . store "t" into "s" and return true on success
  1371. bool htmlEncode ( SafeBuf* s , char *t , char *tend , bool pound ,
  1372. int32_t niceness ) {
  1373. for ( ; t < tend ; t++ ) {
  1374. QUICKPOLL(niceness);
  1375. if ( *t == '"' ) {
  1376. s->safeMemcpy("&#34;", 5);
  1377. continue;
  1378. }
  1379. if ( *t == '<' ) {
  1380. s->safeMemcpy("&lt;", 4);
  1381. continue;
  1382. }
  1383. if ( *t == '>' ) {
  1384. s->safeMemcpy("&gt;", 4);
  1385. continue;
  1386. }
  1387. if ( *t == '&' ) {
  1388. s->safeMemcpy("&amp;", 5);
  1389. continue;
  1390. }
  1391. if ( *t == '#' && pound ) {
  1392. s->safeMemcpy("&#035;", 6);
  1393. continue;
  1394. }
  1395. // our own specially decoded entities!
  1396. if ( *t == '+' && t[1]=='!' && t[2]=='-' ) {
  1397. s->safeMemcpy("&lt;",4);
  1398. continue;
  1399. }
  1400. // our own specially decoded entities!
  1401. if ( *t == '-' && t[1]=='!' && t[2]=='+' ) {
  1402. s->safeMemcpy("&gt;",4);
  1403. continue;
  1404. }
  1405. *s += *t;
  1406. }
  1407. *s += '\0';
  1408. return true;
  1409. }
  1410. // . convert "-->%22 , &-->%26, +-->%2b, space-->+, ?-->%3f is that it?
  1411. // . convert so we can display as a cgi PARAMETER within a url
  1412. // . used by HttPage2 (cached web page) to encode the query into a url
  1413. // . used by PageRoot to do likewise
  1414. // . returns bytes written into "d" not including terminating \0
  1415. int32_t urlEncode ( char *d , int32_t dlen , char *s , int32_t slen, bool requestPath ) {
  1416. char *dstart = d;
  1417. // subtract 1 to make room for a terminating \0
  1418. char *dend = d + dlen - 1;
  1419. char *send = s + slen;
  1420. for ( ; s < send && d < dend ; s++ ) {
  1421. if ( *s == '\0' && requestPath ) {
  1422. *d++ = *s;
  1423. continue;
  1424. }
  1425. // encode if not fit for display
  1426. if ( ! is_ascii ( *s ) ) goto encode;
  1427. switch ( *s ) {
  1428. case ' ': goto encode;
  1429. case '&': goto encode;
  1430. case '"': goto encode;
  1431. case '+': goto encode;
  1432. case '%': goto encode;
  1433. case '#': goto encode;
  1434. // encoding < and > are more for displaying on an
  1435. // html page than sending to an http server
  1436. case '>': goto encode;
  1437. case '<': goto encode;
  1438. case '?': if ( requestPath ) break;
  1439. goto encode;
  1440. }
  1441. // otherwise, no need to encode
  1442. *d++ = *s;
  1443. continue;
  1444. encode:
  1445. // space to +
  1446. if ( *s == ' ' && d + 1 < dend ) { *d++ = '+'; continue; }
  1447. // break out if no room to encode
  1448. if ( d + 2 >= dend ) break;
  1449. *d++ = '%';
  1450. // store first hex digit
  1451. unsigned char v = ((unsigned char)*s)/16 ;
  1452. if ( v < 10 ) v += '0';
  1453. else v += 'A' - 10;
  1454. *d++ = v;
  1455. // store second hex digit
  1456. v = ((unsigned char)*s) & 0x0f ;
  1457. if ( v < 10 ) v += '0';
  1458. else v += 'A' - 10;
  1459. *d++ = v;
  1460. }
  1461. // NULL terminate it
  1462. *d = '\0';
  1463. // and return the length
  1464. return d - dstart;
  1465. }
  1466. // determine the length of the encoded url, does NOT include NULL
  1467. int32_t urlEncodeLen ( char *s , int32_t slen , bool requestPath ) {
  1468. int32_t dLen = 0;
  1469. char *send = s + slen;
  1470. for ( ; s < send ; s++ ) {
  1471. if ( *s == '\0' && requestPath ) {
  1472. dLen++;
  1473. continue;
  1474. }
  1475. // encode if not fit for display
  1476. if ( ! is_ascii ( *s ) ) goto encode;
  1477. switch ( *s ) {
  1478. case ' ': goto encode;
  1479. case '&': goto encode;
  1480. case '"': goto encode;
  1481. case '+': goto encode;
  1482. case '%': goto encode;
  1483. case '#': goto encode;
  1484. // encoding < and > are more for displaying on an
  1485. // html page than sending to an http server
  1486. case '>': goto encode;
  1487. case '<': goto encode;
  1488. case '?': if ( requestPath ) break;
  1489. goto encode;
  1490. }
  1491. // otherwise, no need to encode
  1492. dLen++;
  1493. continue;
  1494. encode:
  1495. // space to +
  1496. if ( *s == ' ' ) { dLen++; continue; }
  1497. // hex code
  1498. dLen += 3; // %XX
  1499. }
  1500. //dLen++; // NULL TERM
  1501. // and return the length
  1502. return dLen;
  1503. }
  1504. // . decodes "s/slen" and stores into "dest"
  1505. // . returns the number of bytes stored into "dest"
  1506. int32_t urlDecode ( char *dest , char *s , int32_t slen ) {
  1507. int32_t j = 0;
  1508. for ( int32_t i = 0 ; i < slen ; i++ ) {
  1509. if ( s[i] == '+' ) { dest[j++]=' '; continue; }
  1510. dest[j++] = s[i];
  1511. if ( s[i] != '%' ) continue;
  1512. if ( i + 2 >= slen ) continue;
  1513. // if two chars after are not hex chars, it's not an encoding
  1514. if ( ! is_hex ( s[i+1] ) ) continue;
  1515. if ( ! is_hex ( s[i+2] ) ) continue;
  1516. // convert hex chars to values
  1517. unsigned char a = htob ( s[i+1] ) * 16;
  1518. unsigned char b = htob ( s[i+2] ) ;
  1519. dest[j-1] = (char) (a + b);
  1520. i += 2;
  1521. }
  1522. return j;
  1523. }
  1524. int32_t urlDecodeNoZeroes ( char *dest , char *s , int32_t slen ) {
  1525. int32_t j = 0;
  1526. for ( int32_t i = 0 ; i < slen ; i++ ) {
  1527. if ( s[i] == '+' ) { dest[j++]=' '; continue; }
  1528. dest[j++] = s[i];
  1529. if ( s[i] != '%' ) continue;
  1530. if ( i + 2 >= slen ) continue;
  1531. // if two chars after are not hex chars, it's not an encoding
  1532. if ( ! is_hex ( s[i+1] ) ) continue;
  1533. if ( ! is_hex ( s[i+2] ) ) continue;
  1534. // convert hex chars to values
  1535. unsigned char a = htob ( s[i+1] ) * 16;
  1536. unsigned char b = htob ( s[i+2] ) ;
  1537. // NO ZEROES! fixes &content= having decoded \0's in it
  1538. // and setting our parms
  1539. if ( a + b == 0 ) {
  1540. log("fctypes: urlDecodeNoZeros encountered url "
  1541. "encoded zero. truncating http request.");
  1542. return j;
  1543. }
  1544. dest[j-1] = (char) (a + b);
  1545. i += 2;
  1546. }
  1547. return j;
  1548. }
  1549. // . like above, but only decodes chars that should not have been encoded
  1550. // . will also encode binary chars
  1551. int32_t urlNormCode ( char *d , int32_t dlen , char *s , int32_t slen ) {
  1552. // save start of destination buffer for returning the length
  1553. char *dstart = d;
  1554. // subtract 1 for NULL termination
  1555. char *dend = d + dlen - 1;
  1556. char *send = s + slen;
  1557. for ( ; s < send && d < dend ; s++ ) {
  1558. // if its non-ascii, encode it so it displays correctly
  1559. if ( ! is_ascii ( *s ) ) {
  1560. // break if no room to encode it
  1561. if ( d + 2 >= dend ) break;
  1562. // store it encoded
  1563. *d++ = '%';
  1564. // store first hex digit
  1565. unsigned char v = ((unsigned char)*s)/16 ;
  1566. if ( v < 10 ) v += '0';
  1567. else v += 'A' - 10;
  1568. *d++ = v;
  1569. // store second hex digit
  1570. v = ((unsigned char)*s) & 0x0f ;
  1571. if ( v < 10 ) v += '0';
  1572. else v += 'A' - 10;
  1573. *d++ = v;
  1574. continue;
  1575. }
  1576. // store it
  1577. *d++ = *s;
  1578. // but it might be something encoded that should not have been
  1579. if ( *s != '%' ) continue;
  1580. // it requires to following chars to decode
  1581. if ( s + 2 >= send ) continue;
  1582. // if two chars after are not hex chars, it's not an encoding
  1583. if ( ! is_hex ( s[1] ) ) continue;
  1584. if ( ! is_hex ( s[2] ) ) continue;
  1585. // convert hex chars to values
  1586. unsigned char a = htob ( s[1] ) * 16;
  1587. unsigned char b = htob ( s[2] ) ;
  1588. unsigned char v = a + b;
  1589. // don't decode if it decodes in these chars
  1590. switch ( v ) {
  1591. case ' ': continue;
  1592. case '&': continue;
  1593. case '"': continue;
  1594. case '+': continue;
  1595. case '%': continue;
  1596. case '>': continue;
  1597. case '<': continue;
  1598. case '?': continue;
  1599. case '=': continue;
  1600. }
  1601. // otherwise, it's fine to decode it
  1602. d[-1] = (char) (a + b);
  1603. // skip over those 2 chars as well as leading '%'
  1604. s += 2;
  1605. }
  1606. // NULL terminate
  1607. *d = '\0';
  1608. // return length
  1609. return d - dstart ;
  1610. }
  1611. // approximate # of non-punct words
  1612. int32_t getNumWords ( char *s ) {
  1613. int32_t count = 0;
  1614. loop:
  1615. // skip punct
  1616. while ( ! is_alnum_a(*s) ) s++;
  1617. // bail if done
  1618. if ( !*s ) return count;
  1619. // count a word
  1620. count++;
  1621. // skip word
  1622. while ( is_alnum_a(*s) ) s++;
  1623. // watch for ' letter punct
  1624. if ( *s=='\'' && is_alnum_a(*(s+1)) && !is_alnum_a(*(s+2)) ) {
  1625. // skip apostrophe
  1626. s++;
  1627. // skip rest of word
  1628. while ( is_alnum_a(*s) ) s++;
  1629. }
  1630. goto loop;
  1631. }
  // . millisecond offset that is added to this machine's local clock to get
  //   the "global" time, and subtracted to go back
  // . set by settimeofdayInMillisecondsGlobal() and loadTimeAdjustment()
  static int64_t s_adjustment = 0;
  1633. int64_t globalToLocalTimeMilliseconds ( int64_t global ) {
  1634. // sanity check
  1635. //if ( ! g_clockInSync )
  1636. // log("gb: Converting global time but clock not in sync.");
  1637. return global - s_adjustment;
  1638. }
  1639. int64_t localToGlobalTimeMilliseconds ( int64_t local ) {
  1640. // sanity check
  1641. //if ( ! g_clockInSync )
  1642. // log("gb: Converting global time but clock not in sync.");
  1643. return local + s_adjustment;
  1644. }
  1645. int32_t globalToLocalTimeSeconds ( int32_t global ) {
  1646. // sanity check
  1647. //if ( ! g_clockInSync )
  1648. // log("gb: Converting global time but clock not in sync.");
  1649. return global - (s_adjustment/1000);
  1650. }
  1651. int32_t localToGlobalTimeSeconds ( int32_t local ) {
  1652. // sanity check
  1653. //if ( ! g_clockInSync )
  1654. // log("gb: Converting global time but clock not in sync.");
  1655. return local + (s_adjustment/1000);
  1656. }
  1657. #include "Timedb.h"
  // full path ("dir/filename") of the file the clock adjustment is saved to
  // and loaded from; filled in by setTimeAdjustmentFilename()
  static char s_tafile[1024];
  // true once setTimeAdjustmentFilename() has been called; until then the
  // load and save functions do nothing
  static bool s_hasFileName = false;
  1660. // returns false and sets g_errno on error
  1661. bool setTimeAdjustmentFilename ( char *dir, char *filename ) {
  1662. s_hasFileName = true;
  1663. int32_t len1 = gbstrlen(dir);
  1664. int32_t len2 = gbstrlen(filename);
  1665. if ( len1 + len2 > 1000 ) { char *xx=NULL;*xx=0; }
  1666. sprintf(s_tafile,"%s/%s",dir,filename);
  1667. return true;
  1668. }
  1669. // returns false and sets g_errno on error
  1670. bool loadTimeAdjustment ( ) {
  1671. // bail if no filename to read
  1672. if ( ! s_hasFileName ) return true;
  1673. // read it in
  1674. // one line in text
  1675. int fd = open ( s_tafile , O_RDONLY );
  1676. if ( fd < 0 ) {
  1677. log("util: could not open %s for reading",s_tafile);
  1678. g_errno = errno;
  1679. return false;
  1680. }
  1681. char rbuf[1024];
  1682. // read in max bytes
  1683. int nr = read ( fd , rbuf , 1000 );
  1684. if ( nr <= 10 || nr > 1000 ) {
  1685. log("util: reading %s had error: %s",s_tafile,
  1686. mstrerror(errno));
  1687. close(fd);
  1688. g_errno = errno;
  1689. return false;
  1690. }
  1691. close(fd);
  1692. // parse the text line
  1693. int64_t stampTime = 0LL;
  1694. int64_t clockAdj = 0LL;
  1695. sscanf ( rbuf , "%"UINT64" %"INT64"", &stampTime, &clockAdj );
  1696. // get stamp age
  1697. int64_t local = gettimeofdayInMillisecondsLocal();
  1698. int64_t stampAge = local - stampTime;
  1699. // if too old forget about it
  1700. if ( stampAge > 2*86400 ) return true;
  1701. // update adjustment
  1702. s_adjustment = clockAdj;
  1703. // if stamp in file is within 2 days old, assume its still good
  1704. // this will prevent having to rebuild a sortbydatetable
  1705. // and really slow down loadups
  1706. g_clockInSync = true;
  1707. // note it
  1708. log("util: loaded %s and put clock in sync. age=%"UINT64" adj=%"INT64"",
  1709. s_tafile,stampAge,clockAdj);
  1710. return true;
  1711. }
  1712. // . returns false and sets g_errno on error
  1713. // . saved by Process::saveBlockingFiles1()
  1714. bool saveTimeAdjustment ( ) {
  1715. // fortget it if setTimeAdjustmentFilename never called
  1716. if ( ! s_hasFileName ) return true;
  1717. // must be in sync!
  1718. if ( ! g_clockInSync ) return true;
  1719. // store it
  1720. int64_t local = gettimeofdayInMillisecondsLocal();
  1721. char wbuf[1024];
  1722. sprintf (wbuf,"%"UINT64" %"INT64"\n",local,s_adjustment);
  1723. // write it out
  1724. int fd = open ( s_tafile , O_CREAT|O_RDWR|O_TRUNC , 00666 );
  1725. if ( fd < 0 ) {
  1726. log("util: could not open %s for writing",s_tafile);
  1727. g_errno = errno;
  1728. return false;
  1729. }
  1730. // how many bytes to write?
  1731. int32_t len = gbstrlen(wbuf);
  1732. // read in max bytes
  1733. int nw = write ( fd , wbuf , len );
  1734. if ( nw != len ) {
  1735. log("util: writing %s had error: %s",s_tafile,
  1736. mstrerror(errno));
  1737. close(fd);
  1738. g_errno = errno;
  1739. return false;
  1740. }
  1741. close(fd);
  1742. // note it
  1743. log("util: saved %s",s_tafile);
  1744. // it was written ok
  1745. return true;
  1746. }
  1747. // a "fake" settimeofdayInMilliseconds()
  1748. void settimeofdayInMillisecondsGlobal ( int64_t newTime ) {
  1749. // can't do this in sig handler
  1750. if ( g_inSigHandler ) return;
  1751. // this isn't async signal safe...
  1752. struct timeval tv;
  1753. gettimeofday ( &tv , NULL );
  1754. int64_t now=(int64_t)(tv.tv_usec/1000)+((int64_t)tv.tv_sec)*1000;
  1755. // bail if no change... UNLESS we need to sync clock!!
  1756. if ( s_adjustment == newTime - now && g_clockInSync ) return;
  1757. // log it, that way we know if there is another issue
  1758. // with flip-flopping (before we synced with host #0 and also
  1759. // with proxy #0)
  1760. int64_t delta = s_adjustment - (newTime - now) ;
  1761. if ( delta > 100 || delta < -100 )
  1762. logf(LOG_INFO,"gb: Updating clock adjustment from "
  1763. "%"INT64" ms to %"INT64" ms", s_adjustment , newTime - now );
  1764. // set adjustment
  1765. s_adjustment = newTime - now;
  1766. // return?
  1767. if ( g_clockInSync ) return;
  1768. // we are now in sync
  1769. g_clockInSync = true;
  1770. // log it
  1771. if ( s_hasFileName )
  1772. logf(LOG_INFO,"gb: clock is now synced with host #0. "
  1773. "saving to %s",s_tafile);
  1774. else
  1775. logf(LOG_INFO,"gb: clock is now synced with host #0.");
  1776. // save
  1777. saveTimeAdjustment();
  1778. // force timedb to load now!
  1779. //initAllSortByDateTables ( );
  1780. }
  1781. time_t getTimeGlobal() {
  1782. return gettimeofdayInMillisecondsSynced() / 1000;
  1783. }
  1784. time_t getTimeGlobalNoCore() {
  1785. return gettimeofdayInMillisecondsGlobalNoCore() / 1000;
  1786. }
  1787. time_t getTimeSynced() {
  1788. return gettimeofdayInMillisecondsSynced() / 1000;
  1789. }
  1790. int64_t gettimeofdayInMillisecondsGlobal() {
  1791. return gettimeofdayInMillisecondsSynced();
  1792. }
  1793. #include "Threads.h"
  // . returns the local time in milliseconds plus the clock adjustment
  // . uses the cached g_now unless g_clockNeedsUpdate is set, in which case
  //   it calls gettimeofday(); g_now is only ever moved forward here
  // . logs a warning if the clock is not in sync yet, but still returns a
  //   time
  // . cores if called in a signal handler from a non-thread
  int64_t gettimeofdayInMillisecondsSynced() {
      // i find that a pthread can call this function even though
      // a signal handler is underway in the main thread!
      if ( g_inSigHandler && ! g_threads.amThread() ) {
          char *xx = NULL; *xx = 0; }
      // sanity check
      if ( ! isClockInSync() ) {
          // NOTE(review): s_printed is only incremented inside the branch
          // that requires it to be a multiple of 100, so after the first
          // message it stays at 1 and nothing more is logged. confirm
          // whether periodic logging was intended.
          static int s_printed = 0;
          if ( (s_printed % 100) == 0 ) {
              s_printed++;
              log("xml: clock not in sync with host #0 yet!!!!!!");
          }
      }
      int64_t now;
      // the real time sigalrm interrupt in Loop.cpp sets this to
      // true once per millisecond
      if ( ! g_clockNeedsUpdate ) {
          // cached value is still fresh
          now = g_now;
      }
      else {
          // this isn't async signal safe...
          struct timeval tv;
          gettimeofday ( &tv , NULL );
          now = (int64_t)(tv.tv_usec/1000)+((int64_t)tv.tv_sec)*1000;
      }
      // update the cached local time, never moving it backwards
      if ( now > g_now ) g_now = now;
      g_clockNeedsUpdate = false;
      // adjust from Msg0x11 time adjustments
      now += s_adjustment;
      return now;
  }
  // . returns the local time in milliseconds plus the clock adjustment
  // . always calls gettimeofday(); it neither reads nor clears
  //   g_clockNeedsUpdate
  // . does not check or warn about the clock being out of sync
  // . still cores if called in a signal handler from a non-thread
  int64_t gettimeofdayInMillisecondsGlobalNoCore() {
      // i find that a pthread can call this function even though
      // a signal handler is underway in the main thread!
      if ( g_inSigHandler && ! g_threads.amThread() ) {
          char *xx = NULL; *xx = 0; }
      // this isn't async signal safe...
      struct timeval tv;
      gettimeofday ( &tv , NULL );
      int64_t now=(int64_t)(tv.tv_usec/1000)+((int64_t)tv.tv_sec)*1000;
      // update the cached local time, never moving it backwards
      if ( now > g_now ) g_now = now;
      // adjust from Msg0x11 time adjustments
      now += s_adjustment;
      return now;
  }
  1856. int64_t gettimeofdayInMillisecondsLocal() {
  1857. return gettimeofdayInMilliseconds();
  1858. }
  // . this machine's time from gettimeofday() in microseconds
  // . no caching and no clock adjustment
  uint64_t gettimeofdayInMicroseconds(void) {
      struct timeval now;
      gettimeofday(&now, NULL);
      const uint64_t wholeSecondsUs = (uint64_t)now.tv_sec * 1000000LL;
      return wholeSecondsUs + (uint64_t)now.tv_usec;
  }
  // . "local" means the time on this machine itself, NOT a timezone thing.
  // . returns the local time in milliseconds with no clock adjustment
  // . returns the cached g_now unless g_clockNeedsUpdate is set, in which
  //   case it clears the flag and calls gettimeofday()
  // . cores if called in a signal handler from a non-thread
  int64_t gettimeofdayInMilliseconds() {
      // i find that a pthread can call this function even though
      // a signal handler is underway in the main thread!
      if ( g_inSigHandler && ! g_threads.amThread() ) {
          char *xx = NULL; *xx = 0; }
      // the real time sigalrm interrupt in Loop.cpp sets this to
      // true once per millisecond
      if ( ! g_clockNeedsUpdate )
          return g_now;
      g_clockNeedsUpdate = false;
      // this isn't async signal safe...
      struct timeval tv;
      gettimeofday ( &tv , NULL );
      int64_t now=(int64_t)(tv.tv_usec/1000)+((int64_t)tv.tv_sec)*1000;
      // update the cached local time, never moving it backwards
      if ( now > g_now ) g_now = now;
      // note that "now", not g_now, is returned, so if the system clock
      // stepped backwards the result can be older than the cached value
      return now;
  }
  1893. int64_t gettimeofdayInMilliseconds_force ( ) {
  1894. g_clockNeedsUpdate = true;
  1895. return gettimeofdayInMilliseconds();
  1896. }
  1897. time_t getTime () {
  1898. return getTimeLocal();
  1899. }
  1900. // . get time in seconds
  1901. // . use this instead of call to time(NULL) cuz it uses adjustment
  1902. time_t getTimeLocal () {
  1903. // if in a sig handler then return g_now/1000
  1904. //if ( g_inSigHandler ) return (time_t)(g_now / 1000);
  1905. // i find that a pthread can call this function even though
  1906. // a signal handler is underway in the main thread!
  1907. if ( g_inSigHandler && ! g_threads.amThread() ) {
  1908. char *xx = NULL; *xx = 0; }
  1909. // get time now
  1910. uint32_t now = gettimeofdayInMilliseconds() / 1000;
  1911. // and adjust it
  1912. //now += s_adjustment / 1000;
  1913. return (time_t)now;
  1914. }
  // . make it so we can display the ascii string on an html browser
  // . copy "t" (tlen bytes) into "s", turning '<' into "&lt;", '>' into
  //   "&gt;" and '&' into "&amp;"
  // . "slen" is the size of the "s" buffer; copying stops early once fewer
  //   than 7 bytes remain before the slot reserved for the \0
  // . NULL terminates "s" and returns bytes stored, excluding the \0
  // . returns 0 untouched if slen <= 0
  int32_t saftenTags2 ( char *s , int32_t slen , char *t , int32_t tlen ) {
      // bail if slen is 0
      if ( slen <= 0 ) return 0;
      char *out = s;
      // leave a char for the \0
      char *outEnd = s + slen - 1;
      char *inEnd  = t + tlen;
      while ( t < inEnd && out + 6 < outEnd ) {
          const char c = *t++;
          switch ( c ) {
          case '<':
              out[0] = '&'; out[1] = 'l'; out[2] = 't'; out[3] = ';';
              out += 4;
              break;
          case '>':
              out[0] = '&'; out[1] = 'g'; out[2] = 't'; out[3] = ';';
              out += 4;
              break;
          case '&':
              out[0] = '&'; out[1] = 'a'; out[2] = 'm'; out[3] = 'p';
              out[4] = ';';
              out += 5;
              break;
          default:
              *out++ = c;
              break;
          }
      }
      // NULL terminate "s"
      *out = '\0';
      // return # of bytes, excluding \0, stored into s
      return out - s;
  }
  // . split a millisecond count into days, hours, minutes, seconds and
  //   leftover milliseconds
  // . every part is stored through its own out-pointer
  void getCalendarFromMs(int64_t ms,
                         int32_t* days,
                         int32_t* hours,
                         int32_t* minutes,
                         int32_t* secs,
                         int32_t* msecs) {
      const int32_t msPerSec  = 1000;
      const int32_t msPerMin  = msPerSec  * 60;
      const int32_t msPerHour = msPerMin  * 60;
      const int32_t msPerDay  = msPerHour * 24;
      // peel off the biggest unit first and carry the remainder down
      int64_t left = ms;
      *days    = left / msPerDay;
      left    %= msPerDay;
      *hours   = left / msPerHour;
      left    %= msPerHour;
      *minutes = left / msPerMin;
      left    %= msPerMin;
      *secs    = left / msPerSec;
      left    %= msPerSec;
      *msecs   = left;
  }
  // . sum "buf" as native-byte-order 32-bit words, wrapping on overflow
  // . only the first bufLen/4 whole words count, so up to 3 trailing bytes
  //   are ignored
  // . each word is copied out bytewise so "buf" does not have to be 4-byte
  //   aligned (the old cast to uint32_t* was a misaligned/aliased read)
  uint32_t calculateChecksum(char *buf, int32_t bufLen){
      uint32_t sum = 0;
      const int32_t numWords = bufLen >> 2;
      for ( int32_t i = 0 ; i < numWords ; i++ ) {
          uint32_t word;
          unsigned char *dst = (unsigned char *)&word;
          const char    *src = buf + i * (int32_t)sizeof(uint32_t);
          for ( uint32_t b = 0 ; b < sizeof(uint32_t) ; b++ )
              dst[b] = (unsigned char)src[b];
          sum += word;
      }
      return sum;
  }
  1980. bool anchorIsLink( char *tag, int32_t tagLen){
  1981. if (strncasestr(tag, tagLen, "href")) return true;
  1982. if (strncasestr(tag, tagLen, "onclick")) return true;
  1983. return false;
  1984. }
  1985. bool has_alpha_a ( char *s , char *send ) {
  1986. for ( ; s < send ; s++ )
  1987. if (is_alpha_a(*s)) return true;
  1988. return false;
  1989. }
  1990. bool has_alpha_utf8 ( char *s , char *send ) {
  1991. char cs = 0;
  1992. for ( ; s < send ; s += cs ) {
  1993. cs = getUtf8CharSize ( s );
  1994. if ( cs == 1 ) {
  1995. if (is_alpha_a(*s)) return true;
  1996. continue;
  1997. }
  1998. if ( is_alpha_utf8(s) ) return true;
  1999. }
  2000. return false;
  2001. }
  // . takes "input" and skips its leading whitespace
  // . stores a ptr to the first non-space char in *numPtr
  // . returns a ptr to the first whitespace char after that token, or to
  //   the terminating \0 if there is none
  char* getNextNum(char* input, char** numPtr) {
      char *p = input;
      // cast to unsigned char: handing isspace() a negative char is undefined
      while ( *p && isspace ( (unsigned char)*p ) ) p++;
      *numPtr = p;
      char *tokenEnd = p;
      while ( *tokenEnd && ! isspace ( (unsigned char)*tokenEnd ) )
          tokenEnd++;
      return tokenEnd;
  }
  2015. #include "HttpMime.h" // CT_HTML
  // Strips html tags from "content" according to the "strip" mode and
  // returns the length of what is left.
  // . strip == 0: nothing is done; a warning is logged and contentLen is
  //   returned unchanged
  // . strip == 1: per-tag decision taken from g_nodes[id].m_filterKeep1
  // . strip == 3: every tag node is dropped, and so is everything between
  //   script (83) / style (111) tags; an img (54) is replaced by its alt or
  //   title text
  // . strip == 4: only script (83) tags and what lies between them go
  // . any other value: per-tag decision from g_nodes[id].m_filterKeep2
  // . returns 0 if content is NULL or contentLen is 0
  // . NOTE(review): the kept nodes are compacted into the buffer returned
  //   by tmpXml.getContent(); presumably that is the caller's buffer, but
  //   that is not visible from here -- confirm against Xml::set()
  2016. // returns length of stripped content, but will set g_errno and return -1
  2017. // on error
  2018. int32_t stripHtml( char *content, int32_t contentLen, int32_t version, int32_t strip ) {
  2019. if ( !strip ) {
  2020. log( LOG_WARN, "query: html stripping not required!" );
  2021. return contentLen;
  2022. }
  2023. if ( ! content )
  2024. return 0;
  2025. if ( contentLen == 0 )
  2026. return 0;
  2027. // filter content if we should
  2028. // keep this on the big stack so "content" still references something
  2029. Xml tmpXml;
  2030. // . get the content as xhtml (should be NULL terminated)
  2031. // . parse as utf8 since all we are doing is messing with
  2032. // the tags...content manipulation comes later
  2033. if ( ! tmpXml.set ( content , contentLen,
  2034. false, 0, false, version , true , 0 , CT_HTML ) )
  2035. return -1;
  2036. //if( strip == 4 )
  2037. // return tmpXml.getText( content, contentLen );
  2038. // go tag by tag
  2039. int32_t n = tmpXml.getNumNodes();
  2040. XmlNode *nodes = tmpXml.getNodes();
  2041. // Xml class may have converted to utf16
  2042. content = tmpXml.getContent();
  2043. contentLen = tmpXml.getContentLen();
  // "x" is the write cursor for kept nodes, "xend" the end of the buffer
  2044. char *x = content;
  2045. char *xend = content + contentLen;
  // . stackid/stackc track a run of nested tags (all of id "stackid") that
  //   is being dropped; stackc is the nesting depth, 0 meaning no run
  // . skipIt is set while we are between script/style tags (modes 3 and 4)
  2046. int32_t stackid = -1;
  2047. int32_t stackc = 0;
  2048. char skipIt = 0;
  2049. // . hack COL tag to NOT require a back tag
  2050. // . do not leave it that way as it could mess up our parsing
  2051. //g_nodes[25].m_hasBackTag = 0;
  2052. for ( int32_t i = 0 ; i < n ; i++ ) {
  2053. // get id of this node
  // (an id of 0 appears to mean a non-tag node -- confirm against XmlNode)
  2054. int32_t id = nodes[i].m_nodeId;
  2055. // if strip is 4, just remove the script tag
  2056. if( strip == 4 ){
  2057. if ( id ){
  2058. if ( id == 83 ){
  // flips on every script tag node, front or back alike
  2059. skipIt ^= 1;
  2060. continue;
  2061. }
  2062. }
  2063. else if ( skipIt ) continue;
  2064. goto keepit;
  2065. }
  2066. // if strip is 3, ALL tags will be removed!
  2067. if( strip == 3 ) {
  2068. if( id ) {
  2069. // . we dont want anything in between:
  2070. // - script tags (83)
  2071. // - style tags (111)
  2072. if ((id == 83) || (id == 111)) skipIt ^= 1;
  2073. // save img to have alt text kept.
  2074. if ( id == 54 ) goto keepit;
  2075. continue;
  2076. }
  2077. else {
  2078. if( skipIt ) continue;
  2079. goto keepit;
  2080. }
  2081. }
  2082. // get it
  2083. int32_t fk;
  2084. if ( strip == 1 ) fk = g_nodes[id].m_filterKeep1;
  2085. else fk = g_nodes[id].m_filterKeep2;
  2086. // if tag is <link ...> only keep it if it has
  2087. // rel="stylesheet" or rel=stylesheet
  2088. if ( strip == 2 && id == 62 ) { // <link> tag id
  2089. int32_t fflen;
  2090. char *ff = nodes[i].getFieldValue ( "rel" , &fflen );
  // note: this comparison is case sensitive
  2091. if ( ff && fflen == 10 &&
  2092. strncmp(ff,"stylesheet",10) == 0 )
  2093. goto keepit;
  2094. }
  2095. // just remove just the tag if this is 2
  2096. if ( fk == 2 ) continue;
  2097. // keep it if not in a stack
  2098. if ( ! stackc && fk ) goto keepit;
  // from here on the node is dropped; what remains is stack bookkeeping
  2099. // if no front/back for tag, just skip it
  2100. if ( ! nodes[i].m_hasBackTag ) continue;
  2101. // start stack if none
  2102. if ( stackc == 0 ) {
  2103. // but not if this is a back tag
  // (second char of the node text is '/' for a back tag)
  2104. if ( nodes[i].m_node[1] == '/' ) continue;
  2105. // now start the stack
  2106. stackid = id;
  2107. stackc = 1;
  2108. continue;
  2109. }
  2110. // skip if this tag does not match what is on stack
  2111. if ( id != stackid ) continue;
  2112. // if ANOTHER front tag, inc stack
  2113. if ( nodes[i].m_node[1] != '/' ) stackc++;
  2114. // otherwise, dec the stack count
  2115. else stackc--;
  2116. // . ensure not negative from excess back tags
  2117. // . reset stackid to -1 to indicate no stack
  2118. if ( stackc <= 0 ) { stackid= -1; stackc = 0; }
  2119. // skip it
  2120. continue;
  2121. keepit:
  2122. // replace images with their alt text
  2123. int32_t vlen;
  2124. char *v;
  2125. if ( id == 54 ) {
  2126. v = nodes[i].getFieldValue("alt", &vlen );
  2127. // try title if no alt text
  2128. if ( ! v )
  2129. v = nodes[i].getFieldValue("title", &vlen );
  // only the attribute text is stored; the img tag itself is dropped
  2130. if ( v ) { gbmemcpy ( x, v, vlen ); x += vlen; }
  2131. continue;
  2132. }
  2133. // remove background image from body,table,td tags
  2134. if ( id == 19 || id == 93 || id == 95 ) {
  2135. v = nodes[i].getFieldValue("background", &vlen);
  2136. // remove background, just sabotage it
  // NOTE(review): overwrites the byte 4 before the value, presumably a
  // letter of the attribute name -- confirm this holds for every
  // quoting/spacing style getFieldValue() accepts
  2137. if ( v ) v[-4] = 'x';
  2138. }
  2139. // store it
  2140. gbmemcpy ( x , nodes[i].m_node , nodes[i].m_nodeLen );
  2141. x += nodes[i].m_nodeLen;
  2142. // sanity check
  // (forces a crash via a NULL write if we ran past the end of the buffer)
  2143. if ( x > xend ) { char *xx=NULL;*xx=0;}
  2144. }
  // length of what we kept, then NULL terminate it
  2145. contentLen = x - content;
  2146. content [ contentLen ] = '\0';
  2147. // unhack COL tag
  2148. //g_nodes[25].m_hasBackTag = 1;
  2149. return contentLen;
  2150. }
  // . true if "s" is a char we allow in a url:
  //   alphanumerics plus / : _ - . ? $ , ~ = # & % + @
  bool is_urlchar(char s) {
      // cast to unsigned char: handing isalnum() a negative char is undefined
      if ( isalnum ( (unsigned char)s ) ) return true;
      switch ( s ) {
      case '/': case ':': case '_': case '-': case '.':
      case '?': case '$': case ',': case '~': case '=':
      case '#': case '&': case '%': case '+': case '@':
          return true;
      default:
          return false;
      }
  }
  // . don't allow "> in our input boxes
  // . copies inbuf into outbuf, \0 terminated, at most outbufSize-1 bytes
  // . if a '>' shows up while an odd number of double quotes has been seen
  //   (a possible xss attempt) the output is cut off right before the most
  //   recent quote
  // . returns the # of bytes stored, excluding the \0
  // . if outbufSize <= 0 there is no room for the \0, so outbuf is left
  //   untouched and 0 is returned
  int32_t cleanInput(char *outbuf, int32_t outbufSize, char *inbuf, int32_t inbufLen){
      if ( outbufSize <= 0 ) return 0;
      char *p = outbuf;
      int32_t numQuotes = 0;
      // . index of the last quote in inbuf
      // . bytes are copied 1:1, so it is also its offset in outbuf
      int32_t lastQuote = 0;
      for ( int32_t i = 0 ; i < inbufLen ; i++ ) {
          // always leave a byte for the \0
          if ( p - outbuf >= outbufSize - 1 ) break;
          if ( inbuf[i] == '"' ) {
              numQuotes++;
              lastQuote = i;
          }
          // if we have an odd number of quotes and a close angle bracket
          // it could be an xss attempt
          if ( inbuf[i] == '>' && (numQuotes & 1) ) {
              p = outbuf + lastQuote;
              break;
          }
          *p++ = inbuf[i];
      }
      *p = '\0';
      return p - outbuf;
  }
  //
  // free-function replacement for the old virtual Msg class, which kept
  // breaking how we serialize/deserialize
  //
  // . returns baseSize plus the sum of every int32_t size in the inclusive
  //   range [firstSizeParm,lastSizeParm]
  int32_t getMsgStoredSize ( int32_t baseSize,
                             int32_t *firstSizeParm,
                             int32_t *lastSizeParm ) {
      int32_t total = baseSize;
      // add up string buffer sizes, last one included
      int32_t *cur = firstSizeParm;
      while ( cur <= lastSizeParm ) {
          total += *cur;
          cur++;
      }
      return total;
  }
  2210. // . return ptr to the buffer we serialize into
  2211. // . return NULL and set g_errno on error
  2212. char *serializeMsg ( int32_t baseSize ,
  2213. int32_t *firstSizeParm ,
  2214. int32_t *lastSizeParm ,
  2215. char **firstStrPtr ,
  2216. void *thisPtr ,
  2217. int32_t *retSize ,
  2218. char *userBuf ,
  2219. int32_t userBufSize ,
  2220. bool makePtrsRefNewBuf ) {
  2221. // make a buffer to serialize into
  2222. char *buf = NULL;
  2223. //int32_t need = getStoredSize();
  2224. int32_t need = getMsgStoredSize(baseSize,firstSizeParm,lastSizeParm);
  2225. // big enough?
  2226. if ( need <= userBufSize ) buf = userBuf;
  2227. // alloc if we should
  2228. if ( ! buf ) buf = (char *)mmalloc ( need , "Ra" );
  2229. // bail on error, g_errno should be set
  2230. if ( ! buf ) return NULL;
  2231. // set how many bytes we will serialize into
  2232. *retSize = need;
  2233. // copy the easy stuff
  2234. char *p = buf;
  2235. gbmemcpy ( p , (char *)thisPtr , baseSize );//getBaseSize() );
  2236. p += baseSize; // getBaseSize();
  2237. // then store the strings!
  2238. int32_t *sizePtr = firstSizeParm;//getFirstSizeParm(); // &size_qbuf;
  2239. int32_t *sizeEnd = lastSizeParm;//getLastSizeParm (); // &size_displayMet
  2240. char **strPtr = firstStrPtr;//getFirstStrPtr (); // &ptr_qbuf;
  2241. for ( ; sizePtr <= sizeEnd ; ) {
  2242. // if we are NULL, we are a "bookmark", so
  2243. // we alloc'd space for it, but don't copy into
  2244. // the space until after this call toe serialize()
  2245. if ( ! *strPtr ) goto skip;
  2246. // sanity check -- cannot copy onto ourselves
  2247. if ( p > *strPtr && p < *strPtr + *sizePtr ) {
  2248. char *xx = NULL; *xx = 0; }
  2249. // copy the string into the buffer
  2250. gbmemcpy ( p , *strPtr , *sizePtr );
  2251. skip:
  2252. // . make it point into the buffer now
  2253. // . MDW: why? that is causing problems for the re-call in
  2254. // Msg3a, it calls this twice with the same "m_r"
  2255. if ( makePtrsRefNewBuf ) *strPtr = p;
  2256. // advance our destination ptr
  2257. p += *sizePtr;
  2258. // advance both ptrs to next string
  2259. sizePtr++;
  2260. strPtr++;
  2261. }
  2262. return buf;
  2263. }
  2264. char *serializeMsg2 ( void *thisPtr ,
  2265. int32_t objSize ,
  2266. char **firstStrPtr ,
  2267. int32_t *firstSizeParm ,
  2268. int32_t *retSize ) {
  2269. // make a buffer to serialize into
  2270. char *buf = NULL;
  2271. int32_t baseSize = (char *)firstStrPtr - (char *)thisPtr;
  2272. int nptrs=((char *)firstSizeParm-(char *)firstStrPtr)/sizeof(char *);
  2273. int32_t need = baseSize;
  2274. need += nptrs * sizeof(char *);
  2275. need += nptrs * sizeof(int32_t);
  2276. // tally up the string sizes
  2277. int32_t *srcSizePtr = (int32_t *)firstSizeParm;
  2278. char **srcStrPtr = (char **)firstStrPtr;
  2279. int32_t totalStringSizes = 0;
  2280. for ( int i = 0 ; i < nptrs ; i++ ) {
  2281. if ( srcStrPtr[i] == NULL ) continue;
  2282. totalStringSizes += srcSizePtr[i];
  2283. }
  2284. int32_t stringBufferOffset = need;
  2285. need += totalStringSizes;
  2286. // alloc if we should
  2287. if ( ! buf ) buf = (char *)mmalloc ( need , "sm2" );
  2288. // bail on error, g_errno should be set
  2289. if ( ! buf ) return NULL;
  2290. // set how many bytes we will serialize into
  2291. *retSize = need;
  2292. // copy everything over except strings themselves
  2293. char *p = buf;
  2294. gbmemcpy ( p , (char *)thisPtr , stringBufferOffset );//need );
  2295. // point to the string buffer
  2296. p += stringBufferOffset;
  2297. // then store the strings!
  2298. char **dstStrPtr = (char **)(buf + baseSize );
  2299. int32_t *dstSizePtr = (int32_t *)(buf + baseSize+sizeof(char *)*nptrs);
  2300. for ( int count = 0 ; count < nptrs ; count++ ) {
  2301. // copy ptrs
  2302. //*dstStrPtr = *srcStrPtr;
  2303. //*dstSizePtr = *srcSizePtr;
  2304. // if we are NULL, we are a "bookmark", so
  2305. // we alloc'd space for it, but don't copy into
  2306. // the space until after this call toe serialize()
  2307. if ( ! *srcStrPtr )
  2308. goto skip;
  2309. // if this is valid then size can't be 0! fix upstream.
  2310. if ( ! *srcSizePtr ) { char *xx=NULL;*xx=0; }
  2311. // if size is 0 use gbstrlen. helps with InjectionRequest
  2312. // where we set ptr_url or ptr_content but not size_url, etc.
  2313. //if ( ! *srcSizePtr )
  2314. // *srcSizePtr = gbstrlen(*strPtr);
  2315. // sanity check -- cannot copy onto ourselves
  2316. if ( p > *srcStrPtr && p < *srcStrPtr + *srcSizePtr ) {
  2317. char *xx = NULL; *xx = 0; }
  2318. // copy the string into the buffer
  2319. gbmemcpy ( p , *srcStrPtr , *srcSizePtr );
  2320. skip:
  2321. // point it now into the string buffer
  2322. *dstStrPtr = p;
  2323. // if it is 0 length, make ptr NULL in destination
  2324. if ( *srcSizePtr == 0 || *srcStrPtr == NULL ) {
  2325. *dstStrPtr = NULL;
  2326. *dstSizePtr = 0;
  2327. }
  2328. // advance our destination ptr
  2329. p += *dstSizePtr;
  2330. // advance both ptrs to next string
  2331. srcSizePtr++;
  2332. srcStrPtr++;
  2333. dstSizePtr++;
  2334. dstStrPtr++;
  2335. }
  2336. return buf;
  2337. }
  2338. // convert offsets back into ptrs
  2339. int32_t deserializeMsg ( int32_t baseSize ,
  2340. int32_t *firstSizeParm ,
  2341. int32_t *lastSizeParm ,
  2342. char **firstStrPtr ,
  2343. char *stringBuf ) {
  2344. // point to our string buffer
  2345. char *p = stringBuf;//getStringBuf(); // m_buf;
  2346. // then store the strings!
  2347. int32_t *sizePtr = firstSizeParm;//getFirstSizeParm(); // &size_qbuf;
  2348. int32_t *sizeEnd = lastSizeParm;//getLastSizeParm (); // &size_displayMet
  2349. char **strPtr = firstStrPtr;//getFirstStrPtr (); // &ptr_qbuf;
  2350. for ( ; sizePtr <= sizeEnd ; ) {
  2351. // convert the offset to a ptr
  2352. *strPtr = p;
  2353. // make it NULL if size is 0 though
  2354. if ( *sizePtr == 0 ) *strPtr = NULL;
  2355. // sanity check
  2356. if ( *sizePtr < 0 ) { g_errno = ECORRUPTDATA; return -1;}
  2357. // advance our destination ptr
  2358. p += *sizePtr;
  2359. // advance both ptrs to next string
  2360. sizePtr++;
  2361. strPtr++;
  2362. }
  2363. // return how many bytes we processed
  2364. return baseSize + (p - stringBuf);//getStringBuf());
  2365. }
  // . counterpart of serializeMsg2()
  // . expects the array of string ptrs (firstStrPtr, e.g. ptr_url) to run
  //   right up to the array of sizes (firstSizeParm, e.g. size_url), and the
  //   string data to start right behind the size array
  // . points each ptr at its string, or at NULL if its size is 0
  // . returns false as soon as a negative size is seen, true otherwise
  bool deserializeMsg2 ( char **firstStrPtr ,
                         int32_t *firstSizeParm ) {
      int nptrs=((char *)firstSizeParm-(char *)firstStrPtr)/sizeof(char *);
      // the strings start where the size array ends
      char *p = ((char *)firstSizeParm + sizeof(int32_t)*nptrs);
      for ( int i = 0 ; i < nptrs ; i++ ) {
          int32_t len = firstSizeParm[i];
          // NULL if size is 0, otherwise its spot in the string buffer
          firstStrPtr[i] = len ? p : NULL;
          // sanity check
          if ( len < 0 ) return false;
          // advance to the next string
          p += len;
      }
      return true;
  }
  2393. // print it to stdout for debugging Dates.cpp
  2394. int32_t printTime ( time_t ttt ) {
  2395. //char *s = ctime(&ttt);
  2396. // print in UTC!
  2397. char *s = asctime ( gmtime(&ttt) );
  2398. // strip \n
  2399. s[gbstrlen(s)-1] = '\0';
  2400. fprintf(stderr,"%s UTC\n",s);
  2401. return 0;
  2402. }
  // . like mktime() but treats "ttt" as UTC instead of local time
  // . mktime() interprets "ttt" in the local timezone, so we take the
  //   "timezone" offset back out of its result
  // . returns (time_t)-1 if mktime() fails
  // . NOTE(review): "timezone" looks like the non-DST offset only, so this
  //   may be off if mktime() applies DST to "ttt" -- confirm
  time_t mktime_utc ( struct tm *ttt ) {
      time_t local = mktime ( ttt );
      // . mktime() reports failure as (time_t)-1
      // . other negative values are valid times before 1970 and still need
      //   the adjustment below; testing "local < 0" handed them back as is
      if ( local == (time_t)-1 ) return local;
      // take out our timezone's offset
      return local - timezone;
  }
  2437. bool verifyUtf8 ( char *txt , int32_t tlen ) {
  2438. if ( ! txt || tlen <= 0 ) return true;
  2439. char size;
  2440. char *p = txt;
  2441. char *pend = txt + tlen;
  2442. for ( ; p < pend ; p += size ) {
  2443. size = getUtf8CharSize(p);
  2444. // skip if ascii
  2445. if ( ! (p[0] & 0x80) ) continue;
  2446. // ok, it's a utf8 char, it must have both hi bits set
  2447. if ( (p[0] & 0xc0) != 0xc0 ) return false;
  2448. // if only one byte, we are done.. how can that be?
  2449. if ( size == 1 ) return false;
  2450. //if ( ! utf8IsSane ( p[0] ) ) return false;
  2451. // successive utf8 chars must have & 0xc0 be equal to 0x80
  2452. // but the first char it must equal 0xc0, both set
  2453. if ( (p[1] & 0xc0) != 0x80 ) return false;
  2454. if ( size == 2 ) continue;
  2455. if ( (p[2] & 0xc0) != 0x80 ) return false;
  2456. if ( size == 3 ) continue;
  2457. if ( (p[3] & 0xc0) != 0x80 ) return false;
  2458. }
  2459. if ( p != pend ) return false;
  2460. return true;
  2461. }
  2462. bool verifyUtf8 ( char *txt ) {
  2463. int32_t tlen = gbstrlen(txt);
  2464. return verifyUtf8(txt,tlen);
  2465. }