/src/sphinxjson.cpp

https://github.com/smparkes/sphinxsearch · C++ · 1102 lines · 957 code · 95 blank · 50 comment · 97 complexity · 5d2146a4c107de0cf9c609228ce29a74 MD5 · raw file

  1. //
  2. // $Id$
  3. //
  4. //
  5. // Copyright (c) 2011, Andrew Aksyonoff
  6. // Copyright (c) 2011, Sphinx Technologies Inc
  7. // All rights reserved
  8. //
  9. // This program is free software; you can redistribute it and/or modify
  10. // it under the terms of the GNU General Public License. You should have
  11. // received a copy of the GPL license along with this program; if you
  12. // did not, you can find it at http://www.gnu.org/
  13. //
  14. #include "sphinxjson.h"
  15. #include "sphinxint.h"
  16. #if USE_WINDOWS
  17. #include <io.h> // for isatty() in llsphinxjson.c
  18. #endif
  19. //////////////////////////////////////////////////////////////////////////
  20. /// parser view on a generic node
  21. struct JsonNode_t
  22. {
  23. ESphJsonType m_eType; ///< node type
  24. int64_t m_iValue; ///< integer value, only used for JSON_INT32 and JSON_INT64
  25. double m_fValue; ///< floating point value, only used for JSON_DOUBLE
  26. int m_iStart; ///< string value, start index (inclusive) into m_pBuf, only used for JSON_STRING
  27. int m_iEnd; ///< string value, end index (exclusive) into m_pBuf, only used for JSON_STRING
  28. int m_iHandle; ///< subobject value, index into m_dNodes storage
  29. int m_iKeyStart; ///< node name, start index (inclusive) into m_pBuf
  30. int m_iKeyEnd; ///< node name, end index (exclusive) into m_pBuf
  31. JsonNode_t ()
  32. : m_eType ( JSON_TOTAL )
  33. {}
  34. };
  35. #define YYSTYPE JsonNode_t
  36. // must be included after YYSTYPE declaration
  37. #include "yysphinxjson.h"
  38. /// actually, JSON-to-SphinxBSON converter helper, but who cares
  39. class JsonParser_c : ISphNoncopyable
  40. {
  41. public:
  42. void * m_pScanner;
  43. const char * m_pLastToken;
  44. CSphVector<BYTE> & m_dBuffer;
  45. CSphString & m_sError;
  46. bool m_bAutoconv;
  47. bool m_bToLowercase;
  48. char * m_pBuf;
  49. CSphVector < CSphVector<JsonNode_t> > m_dNodes;
  50. CSphVector<JsonNode_t> m_dEmpty;
  51. public:
  52. JsonParser_c ( CSphVector<BYTE> & dBuffer, bool bAutoconv, bool bToLowercase, CSphString & sError )
  53. : m_pScanner ( NULL )
  54. , m_pLastToken ( NULL )
  55. , m_dBuffer ( dBuffer )
  56. , m_sError ( sError )
  57. , m_bAutoconv ( bAutoconv )
  58. , m_bToLowercase ( bToLowercase )
  59. {
  60. // reserve 4 bytes for Bloom mask
  61. StoreInt ( 0 );
  62. }
  63. protected:
  64. BYTE * BufAlloc ( int iLen )
  65. {
  66. int iPos = m_dBuffer.GetLength();
  67. m_dBuffer.Resize ( m_dBuffer.GetLength()+iLen );
  68. return m_dBuffer.Begin()+iPos;
  69. }
  70. void StoreInt ( int v )
  71. {
  72. BYTE * p = BufAlloc ( 4 );
  73. *p++ = BYTE(DWORD(v));
  74. *p++ = BYTE(DWORD(v) >> 8);
  75. *p++ = BYTE(DWORD(v) >> 16);
  76. *p++ = BYTE(DWORD(v) >> 24);
  77. }
  78. void StoreBigint ( int64_t v )
  79. {
  80. StoreInt ( (DWORD)( v & 0xffffffffUL ) );
  81. StoreInt ( (int)( v>>32 ) );
  82. }
  83. int PackLen ( DWORD v )
  84. {
  85. if ( v<=251 )
  86. return 1;
  87. else if ( v<65536 )
  88. return 3;
  89. else if ( v<16777216 )
  90. return 4;
  91. else
  92. return 5;
  93. }
  94. void PackInt ( DWORD v )
  95. {
  96. assert ( v<16777216 ); // strings over 16M bytes and arrays over 16M entries are not supported
  97. if ( v<252 )
  98. {
  99. m_dBuffer.Add ( BYTE(v) );
  100. } else if ( v<65536 )
  101. {
  102. m_dBuffer.Add ( 252 );
  103. m_dBuffer.Add ( BYTE ( v & 255 ) );
  104. m_dBuffer.Add ( BYTE ( v>>8 ) );
  105. } else
  106. {
  107. m_dBuffer.Add ( 253 );
  108. m_dBuffer.Add ( BYTE ( v & 255 ) );
  109. m_dBuffer.Add ( BYTE ( ( v>>8 ) & 255 ) );
  110. m_dBuffer.Add ( BYTE ( v>>16 ) );
  111. }
  112. }
  113. void PackStr ( const char * s, int iLen )
  114. {
  115. iLen = Min ( iLen, 0xffffff );
  116. PackInt ( iLen );
  117. if ( iLen )
  118. {
  119. BYTE * p = BufAlloc ( iLen );
  120. memcpy ( p, s, iLen );
  121. }
  122. }
  123. int JsonUnescape ( char ** pEscaped, int iLen )
  124. {
  125. assert ( pEscaped );
  126. char * s = *pEscaped;
  127. // skip heading and trailing quotes
  128. if ( ( s[0]=='\'' && s[iLen-1]=='\'' ) || ( s[0]=='"' && s[iLen-1]=='"' ) )
  129. {
  130. s++;
  131. iLen -= 2;
  132. }
  133. char * sMax = s+iLen;
  134. char * d = s;
  135. char * pStart = d;
  136. char sBuf[8] = { 0 };
  137. while ( s<sMax )
  138. {
  139. if ( s[0]=='\\' )
  140. {
  141. switch ( s[1] )
  142. {
  143. case 'b': *d++ = '\b'; break;
  144. case 'n': *d++ = '\n'; break;
  145. case 'r': *d++ = '\r'; break;
  146. case 't': *d++ = '\t'; break;
  147. case 'f': *d++ = '\f'; break; // formfeed (rfc 4627)
  148. case 'u':
  149. // convert 6-byte sequences \u four-hex-digits (rfc 4627) to UTF-8
  150. if ( s+6<=sMax && isxdigit ( s[2] ) && isxdigit ( s[3] ) && isxdigit ( s[4] ) && isxdigit ( s[5] ) )
  151. {
  152. memcpy ( sBuf, s+2, 4 );
  153. d += sphUTF8Encode ( (BYTE*)d, (int)strtol ( sBuf, NULL, 16 ) );
  154. s += 4;
  155. } else
  156. *d++ = s[1];
  157. break;
  158. default:
  159. *d++ = s[1];
  160. }
  161. s += 2;
  162. } else
  163. *d++ = *s++;
  164. }
  165. *pEscaped = pStart;
  166. return d - pStart;
  167. }
  168. void PackNodeStr ( const JsonNode_t & tNode )
  169. {
  170. int iLen = tNode.m_iEnd-tNode.m_iStart;
  171. char *s = m_pBuf + tNode.m_iStart;
  172. iLen = JsonUnescape ( &s, iLen );
  173. PackStr ( s, iLen );
  174. }
  175. int KeyUnescape ( char ** ppKey, int iLen )
  176. {
  177. char * s = *ppKey;
  178. iLen = JsonUnescape ( &s, iLen );
  179. if ( m_bToLowercase )
  180. for ( int i=0; i<iLen; i++ )
  181. s[i] = (char)tolower ( s[i] ); // OPTIMIZE! not sure if significant, but known to be hell slow
  182. *ppKey = s;
  183. return iLen;
  184. }
  185. void StoreMask ( int iOfs, DWORD uMask )
  186. {
  187. for ( int i=0; i<4; i++ )
  188. {
  189. m_dBuffer[iOfs+i] = BYTE ( uMask & 0xff );
  190. uMask >>= 8;
  191. }
  192. }
  193. /// reserve a single byte for a yet-unknown length, to be written later with PackSize()
  194. /// returns its offset, to be used by PackSize() to both calculate and stored the length
  195. int ReserveSize()
  196. {
  197. int iOfs = m_dBuffer.GetLength();
  198. m_dBuffer.Resize ( iOfs+1 );
  199. return iOfs;
  200. }
  201. /// compute current length from the offset reserved with ReserveSize(), and pack the value back there
  202. /// in most cases that single byte is enough; if not, we make room by memmove()ing the data
  203. void PackSize ( int iOfs )
  204. {
  205. int iSize = m_dBuffer.GetLength()-iOfs-1;
  206. int iPackLen = PackLen ( iSize );
  207. if ( iPackLen!=1 )
  208. {
  209. m_dBuffer.Resize ( iOfs+iPackLen+iSize );
  210. memmove ( m_dBuffer.Begin()+iOfs+iPackLen, m_dBuffer.Begin()+iOfs+1, iSize );
  211. }
  212. m_dBuffer.Resize ( iOfs );
  213. PackInt ( iSize );
  214. m_dBuffer.Resize ( iOfs+iPackLen+iSize );
  215. }
  216. public:
  217. void Finalize()
  218. {
  219. m_dBuffer.Add ( JSON_EOF );
  220. }
  221. void NumericFixup ( JsonNode_t & tNode, bool bAutoconv )
  222. {
  223. // parser emits int64 values, fixup them to int32
  224. if ( tNode.m_eType==JSON_INT64 )
  225. {
  226. int iVal = int(tNode.m_iValue);
  227. if ( tNode.m_iValue==int64_t(iVal) )
  228. tNode.m_eType = JSON_INT32;
  229. return;
  230. }
  231. // check for the autoconversion of string values
  232. if ( !bAutoconv || tNode.m_eType!=JSON_STRING )
  233. return;
  234. // check whether the (quoted) string looks like a numeric
  235. int iLen = tNode.m_iEnd - tNode.m_iStart - 2;
  236. if ( iLen<=0 || iLen>=32 )
  237. return;
  238. const char * sValue = m_pBuf + tNode.m_iStart+1;
  239. const char * p = sValue;
  240. const char * pEnd = p+iLen-1;
  241. bool bNumeric = ( *p=='-' || *p=='.' || ( *p>='0' && *p<='9' ) );
  242. bool bDot = ( *p=='.' );
  243. while ( bNumeric && p<pEnd )
  244. {
  245. p++;
  246. if ( *p=='.' )
  247. {
  248. if ( bDot )
  249. bNumeric = false;
  250. bDot = true;
  251. } else
  252. {
  253. if ( *p<'0' || *p >'9' )
  254. bNumeric = false;
  255. }
  256. }
  257. if ( !bNumeric )
  258. return;
  259. // ok, looks numeric, try integer conversion
  260. // OPTIMIZE?
  261. char sVal[32];
  262. memcpy ( sVal, sValue, iLen );
  263. sVal[iLen] = '\0';
  264. if ( !bDot )
  265. {
  266. int64_t iVal = strtoll ( sVal, NULL, 10 );
  267. snprintf ( sVal, sizeof(sVal), INT64_FMT, iVal );
  268. if ( !memcmp ( sValue, sVal, iLen ) )
  269. {
  270. tNode.m_eType = int64_t(int(iVal))==iVal ? JSON_INT32 : JSON_INT64;
  271. tNode.m_iValue = iVal;
  272. }
  273. } else
  274. {
  275. tNode.m_eType = JSON_DOUBLE;
  276. tNode.m_fValue = strtod ( sVal, NULL );
  277. }
  278. }
  279. void WriteNode ( JsonNode_t & tNode, const char * sKey=NULL, int iKeyLen=0 )
  280. {
  281. // convert int64 to int32, strings to numbers if needed
  282. NumericFixup ( tNode, m_bAutoconv );
  283. ESphJsonType eType = tNode.m_eType;
  284. // note m_iHandle may be uninitialized on simple nodes
  285. CSphVector<JsonNode_t> & dNodes = ( ( eType==JSON_MIXED_VECTOR || eType==JSON_OBJECT ) && tNode.m_iHandle>=0 )
  286. ? m_dNodes[ tNode.m_iHandle ]
  287. : m_dEmpty;
  288. // process mixed vector, convert to generic vector if possible
  289. if ( eType==JSON_MIXED_VECTOR )
  290. {
  291. ARRAY_FOREACH ( i, dNodes )
  292. NumericFixup ( dNodes[i], m_bAutoconv );
  293. ESphJsonType eBase = dNodes.GetLength()>0 ? dNodes[0].m_eType : JSON_EOF;
  294. bool bGeneric = ARRAY_ALL ( bGeneric, dNodes, dNodes[_all].m_eType==eBase );
  295. if ( bGeneric )
  296. switch ( eBase )
  297. {
  298. case JSON_INT32: eType = JSON_INT32_VECTOR; break;
  299. case JSON_INT64: eType = JSON_INT64_VECTOR; break;
  300. case JSON_DOUBLE: eType = JSON_DOUBLE_VECTOR; break;
  301. case JSON_STRING: eType = JSON_STRING_VECTOR; break;
  302. default: break; // type matches across all entries, but we do not have a special format for that type
  303. }
  304. }
  305. // check for the root (bson v1), note sKey shouldn't be set
  306. if ( eType==JSON_OBJECT && m_dBuffer.GetLength()==4 && !sKey )
  307. eType = JSON_ROOT;
  308. // write node type
  309. if ( eType!=JSON_ROOT )
  310. m_dBuffer.Add ( (BYTE)eType );
  311. // write key if given
  312. if ( sKey )
  313. PackStr ( sKey, iKeyLen );
  314. switch ( eType )
  315. {
  316. // basic types
  317. case JSON_INT32: StoreInt ( (int)tNode.m_iValue ); break;
  318. case JSON_INT64: StoreBigint ( tNode.m_iValue ); break;
  319. case JSON_DOUBLE: StoreBigint ( sphD2QW ( tNode.m_fValue ) ); break;
  320. case JSON_STRING: PackNodeStr ( tNode ); break;
  321. // literals
  322. case JSON_TRUE:
  323. case JSON_FALSE:
  324. case JSON_NULL:
  325. // no content
  326. break;
  327. // associative arrays
  328. case JSON_ROOT:
  329. case JSON_OBJECT:
  330. {
  331. DWORD uMask = 0;
  332. int iOfs = 0;
  333. if ( eType==JSON_OBJECT )
  334. {
  335. iOfs = ReserveSize();
  336. StoreInt ( uMask );
  337. }
  338. ARRAY_FOREACH ( i, dNodes )
  339. {
  340. char * sObjKey = m_pBuf + dNodes[i].m_iKeyStart;
  341. int iLen = KeyUnescape ( &sObjKey, dNodes[i].m_iKeyEnd-dNodes[i].m_iKeyStart );
  342. WriteNode ( dNodes[i], sObjKey, iLen );
  343. uMask |= sphJsonKeyMask ( sObjKey, iLen );
  344. }
  345. m_dBuffer.Add ( JSON_EOF );
  346. if ( eType==JSON_OBJECT )
  347. {
  348. StoreMask ( iOfs+1, uMask );
  349. PackSize ( iOfs ); // MUST be in this order, because PackSize() might move the data!
  350. } else
  351. {
  352. assert ( eType==JSON_ROOT );
  353. StoreMask ( 0, uMask );
  354. }
  355. break;
  356. }
  357. // mixed array
  358. case JSON_MIXED_VECTOR:
  359. {
  360. int iOfs = ReserveSize();
  361. PackInt ( dNodes.GetLength() );
  362. ARRAY_FOREACH ( i, dNodes )
  363. WriteNode ( dNodes[i] );
  364. PackSize ( iOfs );
  365. break;
  366. }
  367. // optimized (generic) arrays
  368. case JSON_INT32_VECTOR:
  369. PackInt ( dNodes.GetLength() );
  370. ARRAY_FOREACH ( i, dNodes )
  371. StoreInt ( (int)dNodes[i].m_iValue );
  372. break;
  373. case JSON_INT64_VECTOR:
  374. PackInt ( dNodes.GetLength() );
  375. ARRAY_FOREACH ( i, dNodes )
  376. StoreBigint ( dNodes[i].m_iValue );
  377. break;
  378. case JSON_DOUBLE_VECTOR:
  379. PackInt ( dNodes.GetLength() );
  380. ARRAY_FOREACH ( i, dNodes )
  381. StoreBigint ( sphD2QW ( dNodes[i].m_fValue ) );
  382. break;
  383. case JSON_STRING_VECTOR:
  384. {
  385. int iOfs = ReserveSize();
  386. PackInt ( dNodes.GetLength() );
  387. ARRAY_FOREACH ( i, dNodes )
  388. PackNodeStr ( dNodes[i] );
  389. PackSize ( iOfs );
  390. break;
  391. }
  392. default:
  393. assert ( 0 && "internal error: unhandled type" );
  394. break;
  395. }
  396. }
  397. void DebugIndent ( int iLevel )
  398. {
  399. for ( int i=0; i<iLevel; i++ )
  400. printf ( " " );
  401. }
  402. void DebugDump ( ESphJsonType eType, const BYTE ** ppData, int iLevel )
  403. {
  404. DebugIndent ( iLevel );
  405. const BYTE * p = *ppData;
  406. switch ( eType )
  407. {
  408. case JSON_INT32: printf ( "JSON_INT32 %d\n", sphJsonLoadInt ( &p ) ); break;
  409. case JSON_INT64: printf ( "JSON_INT64 "INT64_FMT"\n", sphJsonLoadBigint ( &p ) ); break;
  410. case JSON_DOUBLE: printf ( "JSON_DOUBLE %lf\n", sphQW2D ( sphJsonLoadBigint ( &p ) ) ); break;
  411. case JSON_STRING:
  412. {
  413. int iLen = sphJsonUnpackInt ( &p );
  414. CSphString sVal;
  415. sVal.SetBinary ( (const char*)p, iLen );
  416. printf ( "JSON_STRING \"%s\"\n", sVal.cstr() );
  417. p += iLen;
  418. break;
  419. }
  420. case JSON_TRUE: printf ( "JSON_TRUE\n" ); break;
  421. case JSON_FALSE: printf ( "JSON_FALSE\n" ); break;
  422. case JSON_NULL: printf ( "JSON_NULL\n" ); break;
  423. case JSON_EOF: printf ( "JSON_EOF\n" ); break;
  424. // associative arrays
  425. case JSON_ROOT:
  426. case JSON_OBJECT:
  427. {
  428. if ( eType==JSON_OBJECT )
  429. sphJsonUnpackInt ( &p );
  430. DWORD uMask = sphGetDword(p);
  431. printf ( "%s (bloom mask: 0x%08x)\n", eType==JSON_OBJECT ? "JSON_OBJECT" : "JSON_ROOT", uMask );
  432. p += 4; // skip bloom table
  433. for ( ;; )
  434. {
  435. ESphJsonType eInnerType = (ESphJsonType) *p++;
  436. if ( eInnerType==JSON_EOF )
  437. break;
  438. const int iStrLen = sphJsonUnpackInt ( &p );
  439. CSphString sVal;
  440. sVal.SetBinary ( (const char*)p, iStrLen );
  441. DebugIndent ( iLevel+1 );
  442. printf ( "\"%s\"", sVal.cstr() );
  443. p += iStrLen;
  444. DebugDump ( eInnerType, &p, iLevel+1 );
  445. }
  446. break;
  447. }
  448. case JSON_MIXED_VECTOR:
  449. {
  450. int iTotalLen = sphJsonUnpackInt ( &p );
  451. int iLen = sphJsonUnpackInt ( &p );
  452. printf ( "JSON_MIXED_VECTOR [%d] (%d bytes)\n", iLen, iTotalLen );
  453. for ( int i=0; i<iLen; i++ )
  454. {
  455. ESphJsonType eInnerType = (ESphJsonType)*p++;
  456. DebugDump ( eInnerType, &p, iLevel+1 );
  457. }
  458. break;
  459. }
  460. // optimized arrays ( note they can't be empty )
  461. case JSON_STRING_VECTOR:
  462. {
  463. sphJsonUnpackInt ( &p );
  464. int iLen = sphJsonUnpackInt ( &p );
  465. printf ( "JSON_STRING_VECTOR (%d) [", iLen );
  466. for ( int i=0; i<iLen; i++ )
  467. {
  468. int iStrLen = sphJsonUnpackInt ( &p );
  469. CSphString sVal;
  470. sVal.SetBinary ( (const char*)p, iStrLen );
  471. printf ( "\"%s\"%s", sVal.cstr(), i<iLen-1 ? "," : "]\n" );
  472. p += iStrLen;
  473. }
  474. break;
  475. }
  476. case JSON_INT32_VECTOR:
  477. {
  478. int iLen = sphJsonUnpackInt ( &p );
  479. printf ( "JSON_INT32_VECTOR (%d) [", iLen );
  480. for ( int i=0; i<iLen; i++ )
  481. printf ( "%d%s", sphJsonLoadInt ( &p ), i<iLen-1 ? "," : "]\n" );
  482. break;
  483. }
  484. case JSON_INT64_VECTOR:
  485. {
  486. int iLen = sphJsonUnpackInt ( &p );
  487. printf ( "JSON_INT64_VECTOR (%d) [", iLen );
  488. for ( int i=0; i<iLen; i++ )
  489. printf ( INT64_FMT"%s", sphJsonLoadBigint ( &p ), i<iLen-1 ? "," : "]\n" );
  490. break;
  491. }
  492. case JSON_DOUBLE_VECTOR:
  493. {
  494. int iLen = sphJsonUnpackInt ( &p );
  495. printf ( "JSON_DOUBLE_VECTOR (%d) [", iLen );
  496. for ( int i=0; i<iLen; i++ )
  497. printf ( "%lf%s", sphQW2D ( sphJsonLoadBigint ( &p ) ), i<iLen-1 ? "," : "]\n" );
  498. break;
  499. }
  500. default:
  501. printf ( "UNKNOWN\n" );
  502. break;
  503. }
  504. *ppData = p;
  505. }
  506. void DebugDump ( const BYTE * p )
  507. {
  508. CSphVector<BYTE> dOut;
  509. sphJsonFormat ( dOut, m_dBuffer.Begin() );
  510. dOut.Add ( '\0' );
  511. printf ( "sphJsonFormat: %s\n", (char*)dOut.Begin() );
  512. printf ( "Blob size: %d bytes\n", m_dBuffer.GetLength() );
  513. ESphJsonType eType = sphJsonFindFirst ( &p );
  514. DebugDump ( eType, &p, 0 );
  515. printf ( "\n" );
  516. }
  517. };
  518. // unused parameter, simply to avoid type clash between all my yylex() functions
  519. #define YY_NO_UNISTD_H 1
  520. #define YYLEX_PARAM pParser->m_pScanner, pParser
  521. #define YY_DECL int yylex ( YYSTYPE * lvalp, void * yyscanner, JsonParser_c * pParser )
  522. #include "llsphinxjson.c"
  523. void yyerror ( JsonParser_c * pParser, const char * sMessage )
  524. {
  525. yy2lex_unhold ( pParser->m_pScanner );
  526. pParser->m_sError.SetSprintf ( "%s near '%s'", sMessage, pParser->m_pLastToken );
  527. }
  528. #include "yysphinxjson.c"
  529. bool sphJsonParse ( CSphVector<BYTE> & dData, char * sData, bool bAutoconv, bool bToLowercase, CSphString & sError )
  530. {
  531. int iLen = strlen ( sData );
  532. if ( sData[iLen+1]!=0 )
  533. {
  534. sError = "internal error: input data passed to sphJsonParse() must be terminated with a double zero";
  535. return false;
  536. }
  537. JsonParser_c tParser ( dData, bAutoconv, bToLowercase, sError );
  538. yy2lex_init ( &tParser.m_pScanner );
  539. tParser.m_pBuf = sData; // sphJsonParse() is intentionally destructive, no need to copy data here
  540. YY_BUFFER_STATE tLexerBuffer = yy2_scan_buffer ( sData, iLen+2, tParser.m_pScanner );
  541. if ( !tLexerBuffer )
  542. {
  543. sError = "internal error: yy_scan_buffer() failed";
  544. return false;
  545. }
  546. int iRes = yyparse ( &tParser );
  547. yy2_delete_buffer ( tLexerBuffer, tParser.m_pScanner );
  548. yy2lex_destroy ( tParser.m_pScanner );
  549. tParser.Finalize();
  550. return iRes==0;
  551. }
  552. //////////////////////////////////////////////////////////////////////////
  553. DWORD sphJsonKeyMask ( const char * sKey, int iLen )
  554. {
  555. DWORD uCrc = sphCRC32 ( (const BYTE*)sKey, iLen );
  556. return
  557. ( 1UL<<( uCrc & 31 ) ) +
  558. ( 1UL<<( ( uCrc>>8 ) & 31 ) );
  559. }
  560. // returns -1 if size is unreachable (for remote agents)
  561. int sphJsonNodeSize ( ESphJsonType eType, const BYTE *pData )
  562. {
  563. int iLen;
  564. const BYTE * p = pData;
  565. switch ( eType )
  566. {
  567. case JSON_INT32:
  568. return 4;
  569. case JSON_INT64:
  570. case JSON_DOUBLE:
  571. return 8;
  572. case JSON_INT32_VECTOR:
  573. if ( !p )
  574. return -1;
  575. iLen = sphJsonUnpackInt ( &p );
  576. return p - pData + iLen * 4;
  577. case JSON_INT64_VECTOR:
  578. case JSON_DOUBLE_VECTOR:
  579. if ( !p )
  580. return -1;
  581. iLen = sphJsonUnpackInt ( &p );
  582. return p - pData + iLen * 8;
  583. case JSON_STRING:
  584. case JSON_STRING_VECTOR:
  585. case JSON_MIXED_VECTOR:
  586. case JSON_OBJECT:
  587. if ( !p )
  588. return -1;
  589. iLen = sphJsonUnpackInt ( &p );
  590. return p - pData + iLen;
  591. case JSON_ROOT:
  592. if ( !p )
  593. return -1;
  594. p += 4; // skip filter
  595. for ( ;; )
  596. {
  597. ESphJsonType eNode = (ESphJsonType) *p++;
  598. if ( eNode==JSON_EOF )
  599. break;
  600. // skip key and node
  601. iLen = sphJsonUnpackInt ( &p );
  602. p += iLen;
  603. sphJsonSkipNode ( eNode, &p );
  604. }
  605. return p - pData;
  606. default:
  607. return 0;
  608. }
  609. }
  610. void sphJsonSkipNode ( ESphJsonType eType, const BYTE ** ppData )
  611. {
  612. int iSize = sphJsonNodeSize ( eType, *ppData );
  613. *ppData += iSize;
  614. }
  615. int sphJsonFieldLength ( ESphJsonType eType, const BYTE * pData )
  616. {
  617. const BYTE * p = pData;
  618. int iCount = 0;
  619. switch ( eType )
  620. {
  621. case JSON_INT32:
  622. case JSON_INT64:
  623. case JSON_DOUBLE:
  624. return 1;
  625. case JSON_STRING_VECTOR:
  626. case JSON_MIXED_VECTOR:
  627. sphJsonUnpackInt ( &p );
  628. return sphJsonUnpackInt ( &p );
  629. case JSON_INT32_VECTOR:
  630. case JSON_INT64_VECTOR:
  631. case JSON_DOUBLE_VECTOR:
  632. return sphJsonUnpackInt ( &p );
  633. case JSON_OBJECT:
  634. case JSON_ROOT:
  635. if ( eType==JSON_OBJECT )
  636. sphJsonUnpackInt ( &p ); // skip size
  637. p += 4; // skip filter
  638. for ( ;; )
  639. {
  640. ESphJsonType eNode = (ESphJsonType) *p++;
  641. if ( eNode==JSON_EOF )
  642. break;
  643. int iLen = sphJsonUnpackInt ( &p );
  644. p += iLen;
  645. sphJsonSkipNode ( eNode, &p );
  646. iCount++;
  647. }
  648. return iCount;
  649. default:
  650. return 0;
  651. }
  652. }
  653. ESphJsonType sphJsonFindFirst ( const BYTE ** ppData )
  654. {
  655. // non-zero bloom mask? that is JSON_ROOT (basically a JSON_OBJECT without node header)
  656. if ( sphGetDword(*ppData) )
  657. return JSON_ROOT;
  658. // zero mask? must be followed by the type byte (typically JSON_EOF)
  659. ESphJsonType eType = (ESphJsonType)((*ppData)[4]);
  660. *ppData += 5;
  661. return eType;
  662. }
  663. ESphJsonType sphJsonFindByKey ( ESphJsonType eType, const BYTE ** ppValue, const void * pKey, int iLen, DWORD uMask )
  664. {
  665. if ( eType!=JSON_OBJECT && eType!=JSON_ROOT )
  666. return JSON_EOF;
  667. const BYTE * p = *ppValue;
  668. if ( eType==JSON_OBJECT )
  669. sphJsonUnpackInt ( &p );
  670. if ( ( sphGetDword(p) & uMask )!=uMask )
  671. return JSON_EOF;
  672. p += 4;
  673. for ( ;; )
  674. {
  675. eType = (ESphJsonType) *p++;
  676. if ( eType==JSON_EOF )
  677. break;
  678. int iStrLen = sphJsonUnpackInt ( &p );
  679. p += iStrLen;
  680. if ( iStrLen==iLen && !memcmp ( p-iStrLen, pKey, iStrLen ) )
  681. {
  682. *ppValue = p;
  683. return eType;
  684. }
  685. sphJsonSkipNode ( eType, &p );
  686. }
  687. return JSON_EOF;
  688. }
  689. ESphJsonType sphJsonFindByIndex ( ESphJsonType eType, const BYTE ** ppValue, int iIndex )
  690. {
  691. if ( iIndex<0 )
  692. return JSON_EOF;
  693. const BYTE * p = *ppValue;
  694. switch ( eType )
  695. {
  696. case JSON_INT32_VECTOR:
  697. case JSON_INT64_VECTOR:
  698. case JSON_DOUBLE_VECTOR:
  699. {
  700. int iLen = sphJsonUnpackInt ( &p );
  701. if ( iIndex>=iLen )
  702. return JSON_EOF;
  703. p += iIndex * ( eType==JSON_INT32_VECTOR ? 4 : 8 );
  704. *ppValue = p;
  705. return eType==JSON_INT32_VECTOR ? JSON_INT32
  706. : eType==JSON_INT64_VECTOR ? JSON_INT64
  707. : JSON_DOUBLE;
  708. }
  709. case JSON_STRING_VECTOR:
  710. {
  711. sphJsonUnpackInt ( &p );
  712. int iLen = sphJsonUnpackInt ( &p );
  713. if ( iIndex>=iLen )
  714. return JSON_EOF;
  715. for ( int i=0; i<iIndex; i++ )
  716. {
  717. int iStrLen = sphJsonUnpackInt ( &p );
  718. p += iStrLen;
  719. }
  720. *ppValue = p;
  721. return JSON_STRING;
  722. }
  723. case JSON_MIXED_VECTOR:
  724. {
  725. sphJsonUnpackInt ( &p );
  726. int iLen = sphJsonUnpackInt ( &p );
  727. if ( iIndex>=iLen )
  728. return JSON_EOF;
  729. for ( int i=0; i<iIndex; i++ )
  730. {
  731. eType = (ESphJsonType)*p++;
  732. sphJsonSkipNode ( eType, &p );
  733. }
  734. eType = (ESphJsonType)*p;
  735. *ppValue = p+1;
  736. return eType;
  737. }
  738. default:
  739. return JSON_EOF;
  740. break;
  741. }
  742. }
  743. //////////////////////////////////////////////////////////////////////////
  744. static const BYTE * JsonFormatStr ( CSphVector<BYTE> & dOut, const BYTE * p, bool bQuote=true )
  745. {
  746. int iLen = sphJsonUnpackInt ( &p );
  747. dOut.Reserve ( dOut.GetLength()+iLen );
  748. if ( bQuote )
  749. dOut.Add ( '"' );
  750. while ( iLen-- )
  751. {
  752. if ( *p=='"' )
  753. dOut.Add ( '\\' );
  754. dOut.Add ( *p );
  755. p++;
  756. }
  757. if ( bQuote )
  758. dOut.Add ( '"' );
  759. return p;
  760. }
  761. void JsonAddStr ( CSphVector<BYTE> & dOut, const char * pStr )
  762. {
  763. while ( *pStr )
  764. dOut.Add ( *pStr++ );
  765. }
  766. void sphJsonFormat ( CSphVector<BYTE> & dOut, const BYTE * pData )
  767. {
  768. if ( !pData )
  769. return;
  770. ESphJsonType eType = sphJsonFindFirst ( &pData );
  771. // check for the empty root
  772. if ( eType==JSON_EOF )
  773. {
  774. JsonAddStr ( dOut, "{}" );
  775. return;
  776. }
  777. sphJsonFieldFormat ( dOut, pData, eType );
  778. }
  779. const BYTE * sphJsonFieldFormat ( CSphVector<BYTE> & dOut, const BYTE * pData, ESphJsonType eType, bool bQuoteString )
  780. {
  781. const BYTE * p = pData;
  782. // format value
  783. switch ( eType )
  784. {
  785. case JSON_INT32:
  786. {
  787. int iOff = dOut.GetLength();
  788. dOut.Resize ( iOff+32 );
  789. int iLen = snprintf ( (char *)dOut.Begin()+iOff, 32, "%d", sphJsonLoadInt ( &p ) ); // NOLINT
  790. dOut.Resize ( iOff+iLen );
  791. break;
  792. }
  793. case JSON_INT64:
  794. {
  795. int iOff = dOut.GetLength();
  796. dOut.Resize ( iOff+32 );
  797. int iLen = snprintf ( (char *)dOut.Begin()+iOff, 32, INT64_FMT, sphJsonLoadBigint ( &p ) ); // NOLINT
  798. dOut.Resize ( iOff+iLen );
  799. break;
  800. }
  801. case JSON_DOUBLE:
  802. {
  803. int iOff = dOut.GetLength();
  804. dOut.Resize ( iOff+32 );
  805. int iLen = snprintf ( (char *)dOut.Begin()+iOff, 32, "%lf", sphQW2D ( sphJsonLoadBigint ( &p ) ) ); // NOLINT
  806. dOut.Resize ( iOff+iLen );
  807. break;
  808. }
  809. case JSON_STRING:
  810. p = JsonFormatStr ( dOut, p, bQuoteString );
  811. break;
  812. case JSON_STRING_VECTOR:
  813. {
  814. int iLen = sphJsonUnpackInt ( &p );
  815. dOut.Reserve ( dOut.GetLength()+iLen );
  816. int iVals = sphJsonUnpackInt ( &p );
  817. dOut.Add ( '[' );
  818. for ( int i=0; i<iVals; i++ )
  819. {
  820. if ( i>0 )
  821. dOut.Add ( ',' );
  822. p = JsonFormatStr ( dOut, p );
  823. }
  824. dOut.Add ( ']' );
  825. break;
  826. }
  827. case JSON_INT32_VECTOR:
  828. case JSON_INT64_VECTOR:
  829. case JSON_DOUBLE_VECTOR:
  830. {
  831. int iVals = sphJsonUnpackInt ( &p );
  832. dOut.Add ( '[' );
  833. for ( int i=0; i<iVals; i++ )
  834. {
  835. if ( i>0 )
  836. dOut.Add ( ',' );
  837. int iOff = dOut.GetLength();
  838. dOut.Resize ( iOff+32 );
  839. int iLen = 0;
  840. char * b = (char *)dOut.Begin()+iOff;
  841. switch ( eType )
  842. {
  843. case JSON_INT32_VECTOR: iLen = snprintf ( b, 32, "%d", sphJsonLoadInt ( &p ) ); break; // NOLINT
  844. case JSON_INT64_VECTOR: iLen = snprintf ( b, 32, INT64_FMT, sphJsonLoadBigint ( &p ) ); break; // NOLINT
  845. case JSON_DOUBLE_VECTOR: iLen = snprintf ( b, 32, "%lf", sphQW2D ( sphJsonLoadBigint ( &p ) ) ); break; // NOLINT
  846. default:
  847. break;
  848. }
  849. dOut.Resize ( iOff+iLen );
  850. }
  851. dOut.Add ( ']' );
  852. break;
  853. }
  854. case JSON_MIXED_VECTOR:
  855. {
  856. sphJsonUnpackInt ( &p );
  857. int iVals = sphJsonUnpackInt ( &p );
  858. dOut.Add ( '[' );
  859. for ( int i=0; i<iVals; i++ )
  860. {
  861. if ( i>0 )
  862. dOut.Add ( ',' );
  863. ESphJsonType eNode = (ESphJsonType) *p++;
  864. p = sphJsonFieldFormat ( dOut, p, eNode, true );
  865. }
  866. dOut.Add ( ']' );
  867. break;
  868. }
  869. case JSON_ROOT:
  870. case JSON_OBJECT:
  871. {
  872. if ( eType==JSON_OBJECT )
  873. sphJsonUnpackInt ( &p );
  874. p += 4; // skip bloom table
  875. dOut.Add ( '{' );
  876. for ( int i=0;;i++ )
  877. {
  878. ESphJsonType eNode = (ESphJsonType) *p++;
  879. if ( eNode==JSON_EOF )
  880. break;
  881. if ( i>0 )
  882. dOut.Add ( ',' );
  883. p = JsonFormatStr ( dOut, p );
  884. dOut.Add ( ':' );
  885. p = sphJsonFieldFormat ( dOut, p, eNode, true );
  886. }
  887. dOut.Add ( '}' );
  888. break;
  889. }
  890. case JSON_TRUE: JsonAddStr ( dOut, bQuoteString ? "true" : "True" ); break;
  891. case JSON_FALSE: JsonAddStr ( dOut, bQuoteString ? "false" : "False" ); break;
  892. case JSON_NULL: JsonAddStr ( dOut, bQuoteString ? "null" : "" ); break;
  893. case JSON_EOF: break;
  894. case JSON_TOTAL: break;
  895. }
  896. return p;
  897. }
  898. bool sphJsonNameSplit ( const char * sName, CSphString * sColumn, CSphString * sKey )
  899. {
  900. if ( !sName )
  901. return false;
  902. // find either '[' or '.', what comes first
  903. const char * pSep = sName;
  904. while ( *pSep && *pSep!='.' && *pSep!='[' )
  905. pSep++;
  906. if ( !*pSep )
  907. return false;
  908. int iSep = pSep - sName;
  909. if ( sColumn )
  910. {
  911. sColumn->SetBinary ( sName, iSep );
  912. sColumn->Trim();
  913. }
  914. if ( sKey )
  915. *sKey = sName + iSep + ( *pSep=='.' ? 1 : 0 );
  916. return true;
  917. }
  918. JsonKey_t::JsonKey_t ()
  919. : m_uMask ( 0 )
  920. , m_iLen ( 0 )
  921. {}
  922. JsonKey_t::JsonKey_t ( const char * sKey, int iLen )
  923. {
  924. m_iLen = iLen;
  925. m_uMask = sphJsonKeyMask ( sKey, m_iLen );
  926. m_sKey.SetBinary ( sKey, m_iLen );
  927. }
  928. void JsonStoreInt ( BYTE * p, int v )
  929. {
  930. *p++ = BYTE(DWORD(v));
  931. *p++ = BYTE(DWORD(v) >> 8);
  932. *p++ = BYTE(DWORD(v) >> 16);
  933. *p++ = BYTE(DWORD(v) >> 24);
  934. }
  935. void JsonStoreBigint ( BYTE * p, int64_t v )
  936. {
  937. JsonStoreInt ( p, (DWORD)( v & 0xffffffffUL ) );
  938. JsonStoreInt ( p+4, (int)( v>>32 ) );
  939. }
  940. bool sphJsonInplaceUpdate ( ESphJsonType eValueType, int64_t iValue, ISphExpr * pExpr, BYTE * pStrings, const CSphRowitem * pRow, bool bUpdate )
  941. {
  942. if ( !pExpr || !pStrings )
  943. return false;
  944. pExpr->Command ( SPH_EXPR_SET_STRING_POOL, (void*)pStrings );
  945. CSphMatch tMatch;
  946. tMatch.m_pStatic = pRow;
  947. uint64_t uPacked = pExpr->Int64Eval ( tMatch );
  948. BYTE * pData = pStrings + ( uPacked & 0xffffffff );
  949. ESphJsonType eType = (ESphJsonType)( uPacked >> 32 );
  950. switch ( eType )
  951. {
  952. case JSON_INT32:
  953. if ( eValueType==JSON_DOUBLE )
  954. iValue = (int64_t)sphQW2D ( iValue );
  955. if ( int64_t(int(iValue))!=iValue )
  956. return false;
  957. if ( bUpdate )
  958. JsonStoreInt ( pData, (int)iValue );
  959. break;
  960. case JSON_INT64:
  961. if ( bUpdate )
  962. JsonStoreBigint ( pData, eValueType==JSON_DOUBLE ? (int64_t)sphQW2D ( iValue ) : iValue );
  963. break;
  964. case JSON_DOUBLE:
  965. if ( bUpdate )
  966. JsonStoreBigint ( pData, eValueType==JSON_DOUBLE ? iValue : sphD2QW ( (double)iValue ) );
  967. break;
  968. default:
  969. return false;
  970. }
  971. return true;
  972. }
  973. //
  974. // $Id$
  975. //