PageRenderTime 68ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/src/sphinxjson.cpp

https://bitbucket.org/ecairn/sphinx-official
C++ | 1159 lines | 1019 code | 93 blank | 47 comment | 87 complexity | b8e8b75cca260aae436996c19e7cba9b MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.0
  1. //
  2. // $Id$
  3. //
  4. //
  5. // Copyright (c) 2011-2015, Andrew Aksyonoff
  6. // Copyright (c) 2011-2015, Sphinx Technologies Inc
  7. // All rights reserved
  8. //
  9. // This program is free software; you can redistribute it and/or modify
  10. // it under the terms of the GNU General Public License. You should have
  11. // received a copy of the GPL license along with this program; if you
  12. // did not, you can find it at http://www.gnu.org/
  13. //
  14. #include "sphinxjson.h"
  15. #include "sphinxint.h"
  16. #if USE_WINDOWS
  17. #include <io.h> // for isatty() in llsphinxjson.c
  18. #endif
  19. //////////////////////////////////////////////////////////////////////////
  20. /// parser view on a generic node
  21. struct JsonNode_t
  22. {
  23. ESphJsonType m_eType; ///< node type
  24. int64_t m_iValue; ///< integer value, only used for JSON_INT32 and JSON_INT64
  25. double m_fValue; ///< floating point value, only used for JSON_DOUBLE
  26. int m_iStart; ///< string value, start index (inclusive) into m_pBuf, only used for JSON_STRING
  27. int m_iEnd; ///< string value, end index (exclusive) into m_pBuf, only used for JSON_STRING
  28. int m_iHandle; ///< subobject value, index into m_dNodes storage
  29. int m_iKeyStart; ///< node name, start index (inclusive) into m_pBuf
  30. int m_iKeyEnd; ///< node name, end index (exclusive) into m_pBuf
  31. JsonNode_t ()
  32. : m_eType ( JSON_TOTAL )
  33. {}
  34. };
  35. #define YYSTYPE JsonNode_t
  36. // must be included after YYSTYPE declaration
  37. #include "yysphinxjson.h"
  38. /// actually, JSON-to-SphinxBSON converter helper, but who cares
  39. class JsonParser_c : ISphNoncopyable
  40. {
  41. public:
  42. void * m_pScanner;
  43. const char * m_pLastToken;
  44. CSphVector<BYTE> & m_dBuffer;
  45. CSphString & m_sError;
  46. bool m_bAutoconv;
  47. bool m_bToLowercase;
  48. char * m_pBuf;
  49. CSphVector < CSphVector<JsonNode_t> > m_dNodes;
  50. CSphVector<JsonNode_t> m_dEmpty;
  51. public:
  52. JsonParser_c ( CSphVector<BYTE> & dBuffer, bool bAutoconv, bool bToLowercase, CSphString & sError )
  53. : m_pScanner ( NULL )
  54. , m_pLastToken ( NULL )
  55. , m_dBuffer ( dBuffer )
  56. , m_sError ( sError )
  57. , m_bAutoconv ( bAutoconv )
  58. , m_bToLowercase ( bToLowercase )
  59. {
  60. // reserve 4 bytes for Bloom mask
  61. StoreInt ( 0 );
  62. }
  63. protected:
  64. BYTE * BufAlloc ( int iLen )
  65. {
  66. int iPos = m_dBuffer.GetLength();
  67. m_dBuffer.Resize ( m_dBuffer.GetLength()+iLen );
  68. return m_dBuffer.Begin()+iPos;
  69. }
  70. void StoreInt ( int v )
  71. {
  72. BYTE * p = BufAlloc ( 4 );
  73. *p++ = BYTE(DWORD(v));
  74. *p++ = BYTE(DWORD(v) >> 8);
  75. *p++ = BYTE(DWORD(v) >> 16);
  76. *p++ = BYTE(DWORD(v) >> 24);
  77. }
  78. void StoreBigint ( int64_t v )
  79. {
  80. StoreInt ( (DWORD)( v & 0xffffffffUL ) );
  81. StoreInt ( (int)( v>>32 ) );
  82. }
  83. int PackLen ( DWORD v )
  84. {
  85. if ( v<=251 )
  86. return 1;
  87. else if ( v<65536 )
  88. return 3;
  89. else if ( v<16777216 )
  90. return 4;
  91. else
  92. return 5;
  93. }
  94. void PackInt ( DWORD v )
  95. {
  96. assert ( v<16777216 ); // strings over 16M bytes and arrays over 16M entries are not supported
  97. if ( v<252 )
  98. {
  99. m_dBuffer.Add ( BYTE(v) );
  100. } else if ( v<65536 )
  101. {
  102. m_dBuffer.Add ( 252 );
  103. m_dBuffer.Add ( BYTE ( v & 255 ) );
  104. m_dBuffer.Add ( BYTE ( v>>8 ) );
  105. } else
  106. {
  107. m_dBuffer.Add ( 253 );
  108. m_dBuffer.Add ( BYTE ( v & 255 ) );
  109. m_dBuffer.Add ( BYTE ( ( v>>8 ) & 255 ) );
  110. m_dBuffer.Add ( BYTE ( v>>16 ) );
  111. }
  112. }
  113. void PackStr ( const char * s, int iLen )
  114. {
  115. iLen = Min ( iLen, 0xffffff );
  116. PackInt ( iLen );
  117. if ( iLen )
  118. {
  119. BYTE * p = BufAlloc ( iLen );
  120. memcpy ( p, s, iLen );
  121. }
  122. }
  123. int JsonUnescape ( char ** pEscaped, int iLen )
  124. {
  125. assert ( pEscaped );
  126. char * s = *pEscaped;
  127. // skip heading and trailing quotes
  128. if ( ( s[0]=='\'' && s[iLen-1]=='\'' ) || ( s[0]=='"' && s[iLen-1]=='"' ) )
  129. {
  130. s++;
  131. iLen -= 2;
  132. }
  133. char * sMax = s+iLen;
  134. char * d = s;
  135. char * pStart = d;
  136. char sBuf[8] = { 0 };
  137. while ( s<sMax )
  138. {
  139. if ( s[0]=='\\' )
  140. {
  141. switch ( s[1] )
  142. {
  143. case 'b': *d++ = '\b'; break;
  144. case 'n': *d++ = '\n'; break;
  145. case 'r': *d++ = '\r'; break;
  146. case 't': *d++ = '\t'; break;
  147. case 'f': *d++ = '\f'; break; // formfeed (rfc 4627)
  148. case 'u':
  149. // convert 6-byte sequences \u four-hex-digits (rfc 4627) to UTF-8
  150. if ( s+6<=sMax && isxdigit ( s[2] ) && isxdigit ( s[3] ) && isxdigit ( s[4] ) && isxdigit ( s[5] ) )
  151. {
  152. memcpy ( sBuf, s+2, 4 );
  153. d += sphUTF8Encode ( (BYTE*)d, (int)strtol ( sBuf, NULL, 16 ) );
  154. s += 4;
  155. } else
  156. *d++ = s[1];
  157. break;
  158. default:
  159. *d++ = s[1];
  160. }
  161. s += 2;
  162. } else
  163. *d++ = *s++;
  164. }
  165. *pEscaped = pStart;
  166. return d - pStart;
  167. }
  168. void PackNodeStr ( const JsonNode_t & tNode )
  169. {
  170. int iLen = tNode.m_iEnd-tNode.m_iStart;
  171. char *s = m_pBuf + tNode.m_iStart;
  172. iLen = JsonUnescape ( &s, iLen );
  173. PackStr ( s, iLen );
  174. }
  175. int KeyUnescape ( char ** ppKey, int iLen )
  176. {
  177. char * s = *ppKey;
  178. iLen = JsonUnescape ( &s, iLen );
  179. if ( m_bToLowercase )
  180. for ( int i=0; i<iLen; i++ )
  181. s[i] = (char)tolower ( s[i] ); // OPTIMIZE! not sure if significant, but known to be hell slow
  182. *ppKey = s;
  183. return iLen;
  184. }
  185. void StoreMask ( int iOfs, DWORD uMask )
  186. {
  187. for ( int i=0; i<4; i++ )
  188. {
  189. m_dBuffer[iOfs+i] = BYTE ( uMask & 0xff );
  190. uMask >>= 8;
  191. }
  192. }
  193. /// reserve a single byte for a yet-unknown length, to be written later with PackSize()
  194. /// returns its offset, to be used by PackSize() to both calculate and stored the length
  195. int ReserveSize()
  196. {
  197. int iOfs = m_dBuffer.GetLength();
  198. m_dBuffer.Resize ( iOfs+1 );
  199. return iOfs;
  200. }
  201. /// compute current length from the offset reserved with ReserveSize(), and pack the value back there
  202. /// in most cases that single byte is enough; if not, we make room by memmove()ing the data
  203. void PackSize ( int iOfs )
  204. {
  205. int iSize = m_dBuffer.GetLength()-iOfs-1;
  206. int iPackLen = PackLen ( iSize );
  207. if ( iPackLen!=1 )
  208. {
  209. m_dBuffer.Resize ( iOfs+iPackLen+iSize );
  210. memmove ( m_dBuffer.Begin()+iOfs+iPackLen, m_dBuffer.Begin()+iOfs+1, iSize );
  211. }
  212. m_dBuffer.Resize ( iOfs );
  213. PackInt ( iSize );
  214. m_dBuffer.Resize ( iOfs+iPackLen+iSize );
  215. }
  216. public:
  217. void Finalize()
  218. {
  219. m_dBuffer.Add ( JSON_EOF );
  220. }
  221. void NumericFixup ( JsonNode_t & tNode )
  222. {
  223. // auto-convert string values, if necessary
  224. if ( tNode.m_eType==JSON_STRING && m_bAutoconv )
  225. if ( !sphJsonStringToNumber ( m_pBuf+tNode.m_iStart+1, tNode.m_iEnd-tNode.m_iStart-2, tNode.m_eType, tNode.m_iValue, tNode.m_fValue ) )
  226. return;
  227. // parser and converter emits int64 values, fix them up to int32
  228. if ( tNode.m_eType==JSON_INT64 )
  229. {
  230. int iVal = int(tNode.m_iValue);
  231. if ( tNode.m_iValue==int64_t(iVal) )
  232. tNode.m_eType = JSON_INT32;
  233. }
  234. }
  235. bool WriteNode ( JsonNode_t & tNode, const char * sKey=NULL, int iKeyLen=0 )
  236. {
  237. // convert int64 to int32, strings to numbers if needed
  238. NumericFixup ( tNode );
  239. ESphJsonType eType = tNode.m_eType;
  240. // note m_iHandle may be uninitialized on simple nodes
  241. CSphVector<JsonNode_t> & dNodes = ( ( eType==JSON_MIXED_VECTOR || eType==JSON_OBJECT ) && tNode.m_iHandle>=0 )
  242. ? m_dNodes[ tNode.m_iHandle ]
  243. : m_dEmpty;
  244. // process mixed vector, convert to generic vector if possible
  245. if ( eType==JSON_MIXED_VECTOR )
  246. {
  247. ARRAY_FOREACH ( i, dNodes )
  248. NumericFixup ( dNodes[i] );
  249. ESphJsonType eBase = dNodes.GetLength()>0 ? dNodes[0].m_eType : JSON_EOF;
  250. bool bGeneric = ARRAY_ALL ( bGeneric, dNodes, dNodes[_all].m_eType==eBase );
  251. if ( bGeneric )
  252. switch ( eBase )
  253. {
  254. case JSON_INT32: eType = JSON_INT32_VECTOR; break;
  255. case JSON_INT64: eType = JSON_INT64_VECTOR; break;
  256. case JSON_DOUBLE: eType = JSON_DOUBLE_VECTOR; break;
  257. case JSON_STRING: eType = JSON_STRING_VECTOR; break;
  258. default: break; // type matches across all entries, but we do not have a special format for that type
  259. }
  260. }
  261. // check for the root (bson v1), note sKey shouldn't be set
  262. if ( eType==JSON_OBJECT && m_dBuffer.GetLength()==4 && !sKey )
  263. eType = JSON_ROOT;
  264. // write node type
  265. if ( eType!=JSON_ROOT )
  266. m_dBuffer.Add ( (BYTE)eType );
  267. // write key if given
  268. if ( sKey )
  269. PackStr ( sKey, iKeyLen );
  270. switch ( eType )
  271. {
  272. // basic types
  273. case JSON_INT32: StoreInt ( (int)tNode.m_iValue ); break;
  274. case JSON_INT64: StoreBigint ( tNode.m_iValue ); break;
  275. case JSON_DOUBLE: StoreBigint ( sphD2QW ( tNode.m_fValue ) ); break;
  276. case JSON_STRING: PackNodeStr ( tNode ); break;
  277. // literals
  278. case JSON_TRUE:
  279. case JSON_FALSE:
  280. case JSON_NULL:
  281. // no content
  282. break;
  283. // associative arrays
  284. case JSON_ROOT:
  285. case JSON_OBJECT:
  286. {
  287. DWORD uMask = 0;
  288. int iOfs = 0;
  289. if ( eType==JSON_OBJECT )
  290. {
  291. iOfs = ReserveSize();
  292. StoreInt ( uMask );
  293. }
  294. ARRAY_FOREACH ( i, dNodes )
  295. {
  296. char * sObjKey = m_pBuf + dNodes[i].m_iKeyStart;
  297. int iLen = KeyUnescape ( &sObjKey, dNodes[i].m_iKeyEnd-dNodes[i].m_iKeyStart );
  298. WriteNode ( dNodes[i], sObjKey, iLen );
  299. uMask |= sphJsonKeyMask ( sObjKey, iLen );
  300. }
  301. m_dBuffer.Add ( JSON_EOF );
  302. if ( eType==JSON_OBJECT )
  303. {
  304. StoreMask ( iOfs+1, uMask );
  305. PackSize ( iOfs ); // MUST be in this order, because PackSize() might move the data!
  306. } else
  307. {
  308. assert ( eType==JSON_ROOT );
  309. StoreMask ( 0, uMask );
  310. }
  311. break;
  312. }
  313. // mixed array
  314. case JSON_MIXED_VECTOR:
  315. {
  316. int iOfs = ReserveSize();
  317. PackInt ( dNodes.GetLength() );
  318. ARRAY_FOREACH ( i, dNodes )
  319. WriteNode ( dNodes[i] );
  320. PackSize ( iOfs );
  321. break;
  322. }
  323. // optimized (generic) arrays
  324. case JSON_INT32_VECTOR:
  325. PackInt ( dNodes.GetLength() );
  326. ARRAY_FOREACH ( i, dNodes )
  327. StoreInt ( (int)dNodes[i].m_iValue );
  328. break;
  329. case JSON_INT64_VECTOR:
  330. PackInt ( dNodes.GetLength() );
  331. ARRAY_FOREACH ( i, dNodes )
  332. StoreBigint ( dNodes[i].m_iValue );
  333. break;
  334. case JSON_DOUBLE_VECTOR:
  335. PackInt ( dNodes.GetLength() );
  336. ARRAY_FOREACH ( i, dNodes )
  337. StoreBigint ( sphD2QW ( dNodes[i].m_fValue ) );
  338. break;
  339. case JSON_STRING_VECTOR:
  340. {
  341. int iOfs = ReserveSize();
  342. PackInt ( dNodes.GetLength() );
  343. ARRAY_FOREACH ( i, dNodes )
  344. PackNodeStr ( dNodes[i] );
  345. PackSize ( iOfs );
  346. break;
  347. }
  348. default:
  349. assert ( 0 && "internal error: unhandled type" );
  350. return false;
  351. }
  352. return true;
  353. }
  354. void DebugIndent ( int iLevel )
  355. {
  356. for ( int i=0; i<iLevel; i++ )
  357. printf ( " " );
  358. }
  359. void DebugDump ( ESphJsonType eType, const BYTE ** ppData, int iLevel )
  360. {
  361. DebugIndent ( iLevel );
  362. const BYTE * p = *ppData;
  363. switch ( eType )
  364. {
  365. case JSON_INT32: printf ( "JSON_INT32 %d\n", sphJsonLoadInt ( &p ) ); break;
  366. case JSON_INT64: printf ( "JSON_INT64 "INT64_FMT"\n", sphJsonLoadBigint ( &p ) ); break;
  367. case JSON_DOUBLE: printf ( "JSON_DOUBLE %lf\n", sphQW2D ( sphJsonLoadBigint ( &p ) ) ); break;
  368. case JSON_STRING:
  369. {
  370. int iLen = sphJsonUnpackInt ( &p );
  371. CSphString sVal;
  372. sVal.SetBinary ( (const char*)p, iLen );
  373. printf ( "JSON_STRING \"%s\"\n", sVal.cstr() );
  374. p += iLen;
  375. break;
  376. }
  377. case JSON_TRUE: printf ( "JSON_TRUE\n" ); break;
  378. case JSON_FALSE: printf ( "JSON_FALSE\n" ); break;
  379. case JSON_NULL: printf ( "JSON_NULL\n" ); break;
  380. case JSON_EOF: printf ( "JSON_EOF\n" ); break;
  381. // associative arrays
  382. case JSON_ROOT:
  383. case JSON_OBJECT:
  384. {
  385. if ( eType==JSON_OBJECT )
  386. sphJsonUnpackInt ( &p );
  387. DWORD uMask = sphGetDword(p);
  388. printf ( "%s (bloom mask: 0x%08x)\n", eType==JSON_OBJECT ? "JSON_OBJECT" : "JSON_ROOT", uMask );
  389. p += 4; // skip bloom table
  390. for ( ;; )
  391. {
  392. ESphJsonType eInnerType = (ESphJsonType) *p++;
  393. if ( eInnerType==JSON_EOF )
  394. break;
  395. const int iStrLen = sphJsonUnpackInt ( &p );
  396. CSphString sVal;
  397. sVal.SetBinary ( (const char*)p, iStrLen );
  398. DebugIndent ( iLevel+1 );
  399. printf ( "\"%s\"", sVal.cstr() );
  400. p += iStrLen;
  401. DebugDump ( eInnerType, &p, iLevel+1 );
  402. }
  403. break;
  404. }
  405. case JSON_MIXED_VECTOR:
  406. {
  407. int iTotalLen = sphJsonUnpackInt ( &p );
  408. int iLen = sphJsonUnpackInt ( &p );
  409. printf ( "JSON_MIXED_VECTOR [%d] (%d bytes)\n", iLen, iTotalLen );
  410. for ( int i=0; i<iLen; i++ )
  411. {
  412. ESphJsonType eInnerType = (ESphJsonType)*p++;
  413. DebugDump ( eInnerType, &p, iLevel+1 );
  414. }
  415. break;
  416. }
  417. // optimized arrays ( note they can't be empty )
  418. case JSON_STRING_VECTOR:
  419. {
  420. sphJsonUnpackInt ( &p );
  421. int iLen = sphJsonUnpackInt ( &p );
  422. printf ( "JSON_STRING_VECTOR (%d) [", iLen );
  423. for ( int i=0; i<iLen; i++ )
  424. {
  425. int iStrLen = sphJsonUnpackInt ( &p );
  426. CSphString sVal;
  427. sVal.SetBinary ( (const char*)p, iStrLen );
  428. printf ( "\"%s\"%s", sVal.cstr(), i<iLen-1 ? "," : "]\n" );
  429. p += iStrLen;
  430. }
  431. break;
  432. }
  433. case JSON_INT32_VECTOR:
  434. {
  435. int iLen = sphJsonUnpackInt ( &p );
  436. printf ( "JSON_INT32_VECTOR (%d) [", iLen );
  437. for ( int i=0; i<iLen; i++ )
  438. printf ( "%d%s", sphJsonLoadInt ( &p ), i<iLen-1 ? "," : "]\n" );
  439. break;
  440. }
  441. case JSON_INT64_VECTOR:
  442. {
  443. int iLen = sphJsonUnpackInt ( &p );
  444. printf ( "JSON_INT64_VECTOR (%d) [", iLen );
  445. for ( int i=0; i<iLen; i++ )
  446. printf ( INT64_FMT"%s", sphJsonLoadBigint ( &p ), i<iLen-1 ? "," : "]\n" );
  447. break;
  448. }
  449. case JSON_DOUBLE_VECTOR:
  450. {
  451. int iLen = sphJsonUnpackInt ( &p );
  452. printf ( "JSON_DOUBLE_VECTOR (%d) [", iLen );
  453. for ( int i=0; i<iLen; i++ )
  454. printf ( "%lf%s", sphQW2D ( sphJsonLoadBigint ( &p ) ), i<iLen-1 ? "," : "]\n" );
  455. break;
  456. }
  457. default:
  458. printf ( "UNKNOWN\n" );
  459. break;
  460. }
  461. *ppData = p;
  462. }
  463. void DebugDump ( const BYTE * p )
  464. {
  465. CSphVector<BYTE> dOut;
  466. sphJsonFormat ( dOut, m_dBuffer.Begin() );
  467. dOut.Add ( '\0' );
  468. printf ( "sphJsonFormat: %s\n", (char*)dOut.Begin() );
  469. printf ( "Blob size: %d bytes\n", m_dBuffer.GetLength() );
  470. ESphJsonType eType = sphJsonFindFirst ( &p );
  471. DebugDump ( eType, &p, 0 );
  472. printf ( "\n" );
  473. }
  474. };
  475. // unused parameter, simply to avoid type clash between all my yylex() functions
  476. #define YY_NO_UNISTD_H 1
  477. #define YYLEX_PARAM pParser->m_pScanner, pParser
  478. #define YY_DECL int yylex ( YYSTYPE * lvalp, void * yyscanner, JsonParser_c * pParser )
  479. #include "llsphinxjson.c"
  480. void yyerror ( JsonParser_c * pParser, const char * sMessage )
  481. {
  482. yy2lex_unhold ( pParser->m_pScanner );
  483. pParser->m_sError.SetSprintf ( "%s near '%s'", sMessage, pParser->m_pLastToken );
  484. }
  485. #include "yysphinxjson.c"
  486. bool sphJsonParse ( CSphVector<BYTE> & dData, char * sData, bool bAutoconv, bool bToLowercase, CSphString & sError )
  487. {
  488. int iLen = strlen ( sData );
  489. if ( sData[iLen+1]!=0 )
  490. {
  491. sError = "internal error: input data passed to sphJsonParse() must be terminated with a double zero";
  492. return false;
  493. }
  494. JsonParser_c tParser ( dData, bAutoconv, bToLowercase, sError );
  495. yy2lex_init ( &tParser.m_pScanner );
  496. tParser.m_pBuf = sData; // sphJsonParse() is intentionally destructive, no need to copy data here
  497. YY_BUFFER_STATE tLexerBuffer = yy2_scan_buffer ( sData, iLen+2, tParser.m_pScanner );
  498. if ( !tLexerBuffer )
  499. {
  500. sError = "internal error: yy_scan_buffer() failed";
  501. return false;
  502. }
  503. int iRes = yyparse ( &tParser );
  504. yy2_delete_buffer ( tLexerBuffer, tParser.m_pScanner );
  505. yy2lex_destroy ( tParser.m_pScanner );
  506. tParser.Finalize();
  507. if ( iRes!=0 )
  508. dData.Reset();
  509. return iRes==0;
  510. }
  511. //////////////////////////////////////////////////////////////////////////
  512. DWORD sphJsonKeyMask ( const char * sKey, int iLen )
  513. {
  514. DWORD uCrc = sphCRC32 ( sKey, iLen );
  515. return
  516. ( 1UL<<( uCrc & 31 ) ) +
  517. ( 1UL<<( ( uCrc>>8 ) & 31 ) );
  518. }
  519. // returns -1 if size is unreachable (for remote agents)
  520. int sphJsonNodeSize ( ESphJsonType eType, const BYTE *pData )
  521. {
  522. int iLen;
  523. const BYTE * p = pData;
  524. switch ( eType )
  525. {
  526. case JSON_INT32:
  527. return 4;
  528. case JSON_INT64:
  529. case JSON_DOUBLE:
  530. return 8;
  531. case JSON_INT32_VECTOR:
  532. if ( !p )
  533. return -1;
  534. iLen = sphJsonUnpackInt ( &p );
  535. return p - pData + iLen * 4;
  536. case JSON_INT64_VECTOR:
  537. case JSON_DOUBLE_VECTOR:
  538. if ( !p )
  539. return -1;
  540. iLen = sphJsonUnpackInt ( &p );
  541. return p - pData + iLen * 8;
  542. case JSON_STRING:
  543. case JSON_STRING_VECTOR:
  544. case JSON_MIXED_VECTOR:
  545. case JSON_OBJECT:
  546. if ( !p )
  547. return -1;
  548. iLen = sphJsonUnpackInt ( &p );
  549. return p - pData + iLen;
  550. case JSON_ROOT:
  551. if ( !p )
  552. return -1;
  553. p += 4; // skip filter
  554. for ( ;; )
  555. {
  556. ESphJsonType eNode = (ESphJsonType) *p++;
  557. if ( eNode==JSON_EOF )
  558. break;
  559. // skip key and node
  560. iLen = sphJsonUnpackInt ( &p );
  561. p += iLen;
  562. sphJsonSkipNode ( eNode, &p );
  563. }
  564. return p - pData;
  565. default:
  566. return 0;
  567. }
  568. }
  569. void sphJsonSkipNode ( ESphJsonType eType, const BYTE ** ppData )
  570. {
  571. int iSize = sphJsonNodeSize ( eType, *ppData );
  572. *ppData += iSize;
  573. }
  574. int sphJsonFieldLength ( ESphJsonType eType, const BYTE * pData )
  575. {
  576. const BYTE * p = pData;
  577. int iCount = 0;
  578. switch ( eType )
  579. {
  580. case JSON_INT32:
  581. case JSON_INT64:
  582. case JSON_DOUBLE:
  583. return 1;
  584. case JSON_STRING_VECTOR:
  585. case JSON_MIXED_VECTOR:
  586. sphJsonUnpackInt ( &p );
  587. return sphJsonUnpackInt ( &p );
  588. case JSON_INT32_VECTOR:
  589. case JSON_INT64_VECTOR:
  590. case JSON_DOUBLE_VECTOR:
  591. return sphJsonUnpackInt ( &p );
  592. case JSON_OBJECT:
  593. case JSON_ROOT:
  594. if ( eType==JSON_OBJECT )
  595. sphJsonUnpackInt ( &p ); // skip size
  596. p += 4; // skip filter
  597. for ( ;; )
  598. {
  599. ESphJsonType eNode = (ESphJsonType) *p++;
  600. if ( eNode==JSON_EOF )
  601. break;
  602. int iLen = sphJsonUnpackInt ( &p );
  603. p += iLen;
  604. sphJsonSkipNode ( eNode, &p );
  605. iCount++;
  606. }
  607. return iCount;
  608. default:
  609. return 0;
  610. }
  611. }
  612. ESphJsonType sphJsonFindFirst ( const BYTE ** ppData )
  613. {
  614. // non-zero bloom mask? that is JSON_ROOT (basically a JSON_OBJECT without node header)
  615. if ( sphGetDword(*ppData) )
  616. return JSON_ROOT;
  617. // zero mask? must be followed by the type byte (typically JSON_EOF)
  618. ESphJsonType eType = (ESphJsonType)((*ppData)[4]);
  619. *ppData += 5;
  620. return eType;
  621. }
  622. ESphJsonType sphJsonFindByKey ( ESphJsonType eType, const BYTE ** ppValue, const void * pKey, int iLen, DWORD uMask )
  623. {
  624. if ( eType!=JSON_OBJECT && eType!=JSON_ROOT )
  625. return JSON_EOF;
  626. const BYTE * p = *ppValue;
  627. if ( eType==JSON_OBJECT )
  628. sphJsonUnpackInt ( &p );
  629. if ( ( sphGetDword(p) & uMask )!=uMask )
  630. return JSON_EOF;
  631. p += 4;
  632. for ( ;; )
  633. {
  634. eType = (ESphJsonType) *p++;
  635. if ( eType==JSON_EOF )
  636. break;
  637. int iStrLen = sphJsonUnpackInt ( &p );
  638. p += iStrLen;
  639. if ( iStrLen==iLen && !memcmp ( p-iStrLen, pKey, iStrLen ) )
  640. {
  641. *ppValue = p;
  642. return eType;
  643. }
  644. sphJsonSkipNode ( eType, &p );
  645. }
  646. return JSON_EOF;
  647. }
  648. ESphJsonType sphJsonFindByIndex ( ESphJsonType eType, const BYTE ** ppValue, int iIndex )
  649. {
  650. if ( iIndex<0 )
  651. return JSON_EOF;
  652. const BYTE * p = *ppValue;
  653. switch ( eType )
  654. {
  655. case JSON_INT32_VECTOR:
  656. case JSON_INT64_VECTOR:
  657. case JSON_DOUBLE_VECTOR:
  658. {
  659. int iLen = sphJsonUnpackInt ( &p );
  660. if ( iIndex>=iLen )
  661. return JSON_EOF;
  662. p += iIndex * ( eType==JSON_INT32_VECTOR ? 4 : 8 );
  663. *ppValue = p;
  664. return eType==JSON_INT32_VECTOR ? JSON_INT32
  665. : eType==JSON_INT64_VECTOR ? JSON_INT64
  666. : JSON_DOUBLE;
  667. }
  668. case JSON_STRING_VECTOR:
  669. {
  670. sphJsonUnpackInt ( &p );
  671. int iLen = sphJsonUnpackInt ( &p );
  672. if ( iIndex>=iLen )
  673. return JSON_EOF;
  674. for ( int i=0; i<iIndex; i++ )
  675. {
  676. int iStrLen = sphJsonUnpackInt ( &p );
  677. p += iStrLen;
  678. }
  679. *ppValue = p;
  680. return JSON_STRING;
  681. }
  682. case JSON_MIXED_VECTOR:
  683. {
  684. sphJsonUnpackInt ( &p );
  685. int iLen = sphJsonUnpackInt ( &p );
  686. if ( iIndex>=iLen )
  687. return JSON_EOF;
  688. for ( int i=0; i<iIndex; i++ )
  689. {
  690. eType = (ESphJsonType)*p++;
  691. sphJsonSkipNode ( eType, &p );
  692. }
  693. eType = (ESphJsonType)*p;
  694. *ppValue = p+1;
  695. return eType;
  696. }
  697. default:
  698. return JSON_EOF;
  699. break;
  700. }
  701. }
  702. //////////////////////////////////////////////////////////////////////////
  703. static const BYTE * JsonFormatStr ( CSphVector<BYTE> & dOut, const BYTE * p, bool bQuote=true )
  704. {
  705. int iLen = sphJsonUnpackInt ( &p );
  706. dOut.Reserve ( dOut.GetLength()+iLen );
  707. if ( bQuote )
  708. dOut.Add ( '"' );
  709. while ( iLen-- )
  710. {
  711. if ( bQuote )
  712. {
  713. switch ( *p )
  714. {
  715. case '\b': dOut.Add('\\'); dOut.Add('b'); break;
  716. case '\n': dOut.Add('\\'); dOut.Add('n'); break;
  717. case '\r': dOut.Add('\\'); dOut.Add('r'); break;
  718. case '\t': dOut.Add('\\'); dOut.Add('t'); break;
  719. case '\f': dOut.Add('\\'); dOut.Add('f'); break; // formfeed (rfc 4627)
  720. default:
  721. if ( *p == '"' || *p=='\\' || *p=='/' )
  722. dOut.Add ( '\\' );
  723. dOut.Add ( *p );
  724. }
  725. } else
  726. dOut.Add ( *p );
  727. p++;
  728. }
  729. if ( bQuote )
  730. dOut.Add ( '"' );
  731. return p;
  732. }
  733. void JsonAddStr ( CSphVector<BYTE> & dOut, const char * pStr )
  734. {
  735. while ( *pStr )
  736. dOut.Add ( *pStr++ );
  737. }
  738. void sphJsonFormat ( CSphVector<BYTE> & dOut, const BYTE * pData )
  739. {
  740. if ( !pData )
  741. return;
  742. ESphJsonType eType = sphJsonFindFirst ( &pData );
  743. // check for the empty root
  744. if ( eType==JSON_EOF )
  745. {
  746. JsonAddStr ( dOut, "{}" );
  747. return;
  748. }
  749. sphJsonFieldFormat ( dOut, pData, eType );
  750. }
  751. const BYTE * sphJsonFieldFormat ( CSphVector<BYTE> & dOut, const BYTE * pData, ESphJsonType eType, bool bQuoteString )
  752. {
  753. const BYTE * p = pData;
  754. // format value
  755. switch ( eType )
  756. {
  757. case JSON_INT32:
  758. {
  759. int iOff = dOut.GetLength();
  760. dOut.Resize ( iOff+32 );
  761. int iLen = snprintf ( (char *)dOut.Begin()+iOff, 32, "%d", sphJsonLoadInt ( &p ) ); // NOLINT
  762. dOut.Resize ( iOff+iLen );
  763. break;
  764. }
  765. case JSON_INT64:
  766. {
  767. int iOff = dOut.GetLength();
  768. dOut.Resize ( iOff+32 );
  769. int iLen = snprintf ( (char *)dOut.Begin()+iOff, 32, INT64_FMT, sphJsonLoadBigint ( &p ) ); // NOLINT
  770. dOut.Resize ( iOff+iLen );
  771. break;
  772. }
  773. case JSON_DOUBLE:
  774. {
  775. int iOff = dOut.GetLength();
  776. dOut.Resize ( iOff+32 );
  777. int iLen = snprintf ( (char *)dOut.Begin()+iOff, 32, "%lf", sphQW2D ( sphJsonLoadBigint ( &p ) ) ); // NOLINT
  778. dOut.Resize ( iOff+iLen );
  779. break;
  780. }
  781. case JSON_STRING:
  782. p = JsonFormatStr ( dOut, p, bQuoteString );
  783. break;
  784. case JSON_STRING_VECTOR:
  785. {
  786. int iLen = sphJsonUnpackInt ( &p );
  787. dOut.Reserve ( dOut.GetLength()+iLen );
  788. int iVals = sphJsonUnpackInt ( &p );
  789. dOut.Add ( '[' );
  790. for ( int i=0; i<iVals; i++ )
  791. {
  792. if ( i>0 )
  793. dOut.Add ( ',' );
  794. p = JsonFormatStr ( dOut, p );
  795. }
  796. dOut.Add ( ']' );
  797. break;
  798. }
  799. case JSON_INT32_VECTOR:
  800. case JSON_INT64_VECTOR:
  801. case JSON_DOUBLE_VECTOR:
  802. {
  803. int iVals = sphJsonUnpackInt ( &p );
  804. dOut.Add ( '[' );
  805. for ( int i=0; i<iVals; i++ )
  806. {
  807. if ( i>0 )
  808. dOut.Add ( ',' );
  809. int iOff = dOut.GetLength();
  810. dOut.Resize ( iOff+32 );
  811. int iLen = 0;
  812. char * b = (char *)dOut.Begin()+iOff;
  813. switch ( eType )
  814. {
  815. case JSON_INT32_VECTOR: iLen = snprintf ( b, 32, "%d", sphJsonLoadInt ( &p ) ); break; // NOLINT
  816. case JSON_INT64_VECTOR: iLen = snprintf ( b, 32, INT64_FMT, sphJsonLoadBigint ( &p ) ); break; // NOLINT
  817. case JSON_DOUBLE_VECTOR: iLen = snprintf ( b, 32, "%lf", sphQW2D ( sphJsonLoadBigint ( &p ) ) ); break; // NOLINT
  818. default:
  819. break;
  820. }
  821. dOut.Resize ( iOff+iLen );
  822. }
  823. dOut.Add ( ']' );
  824. break;
  825. }
  826. case JSON_MIXED_VECTOR:
  827. {
  828. sphJsonUnpackInt ( &p );
  829. int iVals = sphJsonUnpackInt ( &p );
  830. dOut.Add ( '[' );
  831. for ( int i=0; i<iVals; i++ )
  832. {
  833. if ( i>0 )
  834. dOut.Add ( ',' );
  835. ESphJsonType eNode = (ESphJsonType) *p++;
  836. p = sphJsonFieldFormat ( dOut, p, eNode, true );
  837. }
  838. dOut.Add ( ']' );
  839. break;
  840. }
  841. case JSON_ROOT:
  842. case JSON_OBJECT:
  843. {
  844. if ( eType==JSON_OBJECT )
  845. sphJsonUnpackInt ( &p );
  846. p += 4; // skip bloom table
  847. dOut.Add ( '{' );
  848. for ( int i=0;;i++ )
  849. {
  850. ESphJsonType eNode = (ESphJsonType) *p++;
  851. if ( eNode==JSON_EOF )
  852. break;
  853. if ( i>0 )
  854. dOut.Add ( ',' );
  855. p = JsonFormatStr ( dOut, p );
  856. dOut.Add ( ':' );
  857. p = sphJsonFieldFormat ( dOut, p, eNode, true );
  858. }
  859. dOut.Add ( '}' );
  860. break;
  861. }
  862. case JSON_TRUE: JsonAddStr ( dOut, bQuoteString ? "true" : "1" ); break;
  863. case JSON_FALSE: JsonAddStr ( dOut, bQuoteString ? "false" : "0" ); break;
  864. case JSON_NULL: JsonAddStr ( dOut, bQuoteString ? "null" : "" ); break;
  865. case JSON_EOF: break;
  866. case JSON_TOTAL: break;
  867. }
  868. return p;
  869. }
  870. bool sphJsonNameSplit ( const char * sName, CSphString * sColumn, CSphString * sKey )
  871. {
  872. if ( !sName )
  873. return false;
  874. // find either '[' or '.', what comes first
  875. const char * pSep = sName;
  876. while ( *pSep && *pSep!='.' && *pSep!='[' )
  877. {
  878. // check for invalid characters
  879. if ( !sphIsAttr( *pSep ) && *pSep!=' ' )
  880. return false;
  881. pSep++;
  882. }
  883. if ( !*pSep )
  884. return false;
  885. int iSep = pSep - sName;
  886. if ( sColumn )
  887. {
  888. sColumn->SetBinary ( sName, iSep );
  889. sColumn->Trim();
  890. }
  891. if ( sKey )
  892. *sKey = sName + iSep + ( *pSep=='.' ? 1 : 0 );
  893. return true;
  894. }
  895. JsonKey_t::JsonKey_t ()
  896. : m_uMask ( 0 )
  897. , m_iLen ( 0 )
  898. {}
  899. JsonKey_t::JsonKey_t ( const char * sKey, int iLen )
  900. {
  901. m_iLen = iLen;
  902. m_uMask = sphJsonKeyMask ( sKey, m_iLen );
  903. m_sKey.SetBinary ( sKey, m_iLen );
  904. }
  905. void JsonStoreInt ( BYTE * p, int v )
  906. {
  907. *p++ = BYTE(DWORD(v));
  908. *p++ = BYTE(DWORD(v) >> 8);
  909. *p++ = BYTE(DWORD(v) >> 16);
  910. *p++ = BYTE(DWORD(v) >> 24);
  911. }
  912. void JsonStoreBigint ( BYTE * p, int64_t v )
  913. {
  914. JsonStoreInt ( p, (DWORD)( v & 0xffffffffUL ) );
  915. JsonStoreInt ( p+4, (int)( v>>32 ) );
  916. }
  917. bool sphJsonInplaceUpdate ( ESphJsonType eValueType, int64_t iValue, ISphExpr * pExpr, BYTE * pStrings, const CSphRowitem * pRow, bool bUpdate )
  918. {
  919. if ( !pExpr || !pStrings )
  920. return false;
  921. pExpr->Command ( SPH_EXPR_SET_STRING_POOL, (void*)pStrings );
  922. CSphMatch tMatch;
  923. tMatch.m_pStatic = pRow;
  924. uint64_t uPacked = pExpr->Int64Eval ( tMatch );
  925. BYTE * pData = pStrings + ( uPacked & 0xffffffff );
  926. ESphJsonType eType = (ESphJsonType)( uPacked >> 32 );
  927. switch ( eType )
  928. {
  929. case JSON_INT32:
  930. if ( eValueType==JSON_DOUBLE )
  931. iValue = (int64_t)sphQW2D ( iValue );
  932. if ( int64_t(int(iValue))!=iValue )
  933. return false;
  934. if ( bUpdate )
  935. JsonStoreInt ( pData, (int)iValue );
  936. break;
  937. case JSON_INT64:
  938. if ( bUpdate )
  939. JsonStoreBigint ( pData, eValueType==JSON_DOUBLE ? (int64_t)sphQW2D ( iValue ) : iValue );
  940. break;
  941. case JSON_DOUBLE:
  942. if ( bUpdate )
  943. JsonStoreBigint ( pData, eValueType==JSON_DOUBLE ? iValue : sphD2QW ( (double)iValue ) );
  944. break;
  945. default:
  946. return false;
  947. }
  948. return true;
  949. }
  950. bool sphJsonStringToNumber ( const char * s, int iLen, ESphJsonType & eType, int64_t & iVal, double & fVal )
  951. {
  952. // skip whitespace
  953. while ( iLen>0 && ( *s==' ' || *s=='\n' || *s=='\r' || *s=='\t' || *s=='\f' ) )
  954. s++, iLen--;
  955. if ( iLen<=0 )
  956. return false;
  957. // check whether the string looks like a numeric
  958. const char * p = s;
  959. const char * pEnd = p+iLen-1;
  960. bool bNumeric = ( *p=='-' || *p=='.' || ( *p>='0' && *p<='9' ) );
  961. bool bDot = ( *p=='.' );
  962. bool bExp = false;
  963. bool bExpSign = false;
  964. while ( bNumeric && p<pEnd )
  965. {
  966. p++;
  967. switch ( *p )
  968. {
  969. case '.':
  970. if ( bDot )
  971. bNumeric = false;
  972. bDot = true;
  973. break;
  974. case 'e':
  975. case 'E':
  976. if ( bExp )
  977. bNumeric = false;
  978. bExp = true;
  979. break;
  980. case '-':
  981. case '+':
  982. if ( !bExp || bExpSign )
  983. bNumeric = false;
  984. bExpSign = true;
  985. break;
  986. default:
  987. if ( *p<'0' || *p >'9' )
  988. bNumeric = false;
  989. }
  990. }
  991. // convert string to number
  992. if ( bNumeric && iLen<32 )
  993. {
  994. char sVal[32];
  995. memcpy ( sVal, s, iLen );
  996. sVal[iLen] = '\0';
  997. char * pCur;
  998. // setting errno to zero is necessary because strtod/strtoll do not indicate
  999. // whether it was an overflow or a valid input for borderline values
  1000. errno = 0;
  1001. if ( bDot || bExp )
  1002. {
  1003. double fRes = strtod ( sVal, &pCur );
  1004. if ( pCur==sVal+iLen && errno!=ERANGE )
  1005. {
  1006. eType = JSON_DOUBLE;
  1007. fVal = fRes;
  1008. return true;
  1009. }
  1010. } else
  1011. {
  1012. int64_t iRes = strtoll ( sVal, &pCur, 10 );
  1013. if ( pCur==sVal+iLen && errno!=ERANGE )
  1014. {
  1015. eType = JSON_INT64;
  1016. iVal = iRes;
  1017. return true;
  1018. }
  1019. }
  1020. }
  1021. return false;
  1022. }
  1023. //
  1024. // $Id$
  1025. //