PageRenderTime 62ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/library/Library/WP7/SQLiteDriver/sqlite/wal_c.cs

https://bitbucket.org/digitalizarte/coolstorage
C# | 2751 lines | 1384 code | 225 blank | 1142 comment | 244 complexity | c71f94acbce0b78bc9e2a56f8e0c2e0a MD5 | raw file

Large files files are truncated, but you can click here to view the full file

  using System;
  using System.Diagnostics;
  using System.Text;
  using Bitmask = System.UInt64;
  using ht_slot = System.UInt16;
  using i64 = System.Int64;
  using u32 = System.UInt32;
  6. namespace Community.CsharpSqlite
  7. {
  8. public partial class Sqlite3
  9. {
  10. /*
  11. ** 2010 February 1
  12. **
  13. ** The author disclaims copyright to this source code. In place of
  14. ** a legal notice, here is a blessing:
  15. **
  16. ** May you do good and not evil.
  17. ** May you find forgiveness for yourself and forgive others.
  18. ** May you share freely, never taking more than you give.
  19. **
  20. *************************************************************************
  21. **
  22. ** This file contains the implementation of a write-ahead log (WAL) used in
  23. ** "journal_mode=WAL" mode.
  24. **
  25. ** WRITE-AHEAD LOG (WAL) FILE FORMAT
  26. **
  27. ** A WAL file consists of a header followed by zero or more "frames".
  28. ** Each frame records the revised content of a single page from the
  29. ** database file. All changes to the database are recorded by writing
  30. ** frames into the WAL. Transactions commit when a frame is written that
  31. ** contains a commit marker. A single WAL can and usually does record
  32. ** multiple transactions. Periodically, the content of the WAL is
  33. ** transferred back into the database file in an operation called a
  34. ** "checkpoint".
  35. **
  36. ** A single WAL file can be used multiple times. In other words, the
  37. ** WAL can fill up with frames and then be checkpointed and then new
  38. ** frames can overwrite the old ones. A WAL always grows from beginning
  39. ** toward the end. Checksums and counters attached to each frame are
  40. ** used to determine which frames within the WAL are valid and which
  41. ** are leftovers from prior checkpoints.
  42. **
  43. ** The WAL header is 32 bytes in size and consists of the following eight
  44. ** big-endian 32-bit unsigned integer values:
  45. **
  46. ** 0: Magic number. 0x377f0682 or 0x377f0683
  47. ** 4: File format version. Currently 3007000
  48. ** 8: Database page size. Example: 1024
  49. ** 12: Checkpoint sequence number
  50. ** 16: Salt-1, random integer incremented with each checkpoint
  51. ** 20: Salt-2, a different random integer changing with each ckpt
  52. ** 24: Checksum-1 (first part of checksum for first 24 bytes of header).
  53. ** 28: Checksum-2 (second part of checksum for first 24 bytes of header).
  54. **
  55. ** Immediately following the wal-header are zero or more frames. Each
  56. ** frame consists of a 24-byte frame-header followed by a <page-size> bytes
  57. ** of page data. The frame-header is six big-endian 32-bit unsigned
  58. ** integer values, as follows:
  59. **
  60. ** 0: Page number.
  61. ** 4: For commit records, the size of the database image in pages
  62. ** after the commit. For all other records, zero.
  63. ** 8: Salt-1 (copied from the header)
  64. ** 12: Salt-2 (copied from the header)
  65. ** 16: Checksum-1.
  66. ** 20: Checksum-2.
  67. **
  68. ** A frame is considered valid if and only if the following conditions are
  69. ** true:
  70. **
  71. ** (1) The salt-1 and salt-2 values in the frame-header match
  72. ** salt values in the wal-header
  73. **
  74. ** (2) The checksum values in the final 8 bytes of the frame-header
  75. ** exactly match the checksum computed consecutively on the
  76. ** WAL header and the first 8 bytes and the content of all frames
  77. ** up to and including the current frame.
  78. **
  79. ** The checksum is computed using 32-bit big-endian integers if the
  80. ** magic number in the first 4 bytes of the WAL is 0x377f0683 and it
  81. ** is computed using little-endian if the magic number is 0x377f0682.
  82. ** The checksum values are always stored in the frame header in a
  83. ** big-endian format regardless of which byte order is used to compute
  84. ** the checksum. The checksum is computed by interpreting the input as
  85. ** an even number of unsigned 32-bit integers: x[0] through x[N]. The
  86. ** algorithm used for the checksum is as follows:
  87. **
  88. ** for i from 0 to n-1 step 2:
  89. ** s0 += x[i] + s1;
  90. ** s1 += x[i+1] + s0;
  91. ** endfor
  92. **
  93. ** Note that s0 and s1 are both weighted checksums using fibonacci weights
  94. ** in reverse order (the largest fibonacci weight occurs on the first element
  95. ** of the sequence being summed.) The s1 value spans all 32-bit
  96. ** terms of the sequence whereas s0 omits the final term.
  97. **
  98. ** On a checkpoint, the WAL is first VFS.xSync-ed, then valid content of the
  99. ** WAL is transferred into the database, then the database is VFS.xSync-ed.
  100. ** The VFS.xSync operations serve as write barriers - all writes launched
  101. ** before the xSync must complete before any write that launches after the
  102. ** xSync begins.
  103. **
  104. ** After each checkpoint, the salt-1 value is incremented and the salt-2
  105. ** value is randomized. This prevents old and new frames in the WAL from
  106. ** being considered valid at the same time and being checkpointed together
  107. ** following a crash.
  108. **
  109. ** READER ALGORITHM
  110. **
  111. ** To read a page from the database (call it page number P), a reader
  112. ** first checks the WAL to see if it contains page P. If so, then the
  113. ** last valid instance of page P that is followed by a commit frame
  114. ** or is a commit frame itself becomes the value read. If the WAL
  115. ** contains no copies of page P that are valid and which are a commit
  116. ** frame or are followed by a commit frame, then page P is read from
  117. ** the database file.
  118. **
  119. ** To start a read transaction, the reader records the index of the last
  120. ** valid frame in the WAL. The reader uses this recorded "mxFrame" value
  121. ** for all subsequent read operations. New transactions can be appended
  122. ** to the WAL, but as long as the reader uses its original mxFrame value
  123. ** and ignores the newly appended content, it will see a consistent snapshot
  124. ** of the database from a single point in time. This technique allows
  125. ** multiple concurrent readers to view different versions of the database
  126. ** content simultaneously.
  127. **
  128. ** The reader algorithm in the previous paragraphs works correctly, but
  129. ** because frames for page P can appear anywhere within the WAL, the
  130. ** reader has to scan the entire WAL looking for page P frames. If the
  131. ** WAL is large (multiple megabytes is typical) that scan can be slow,
  132. ** and read performance suffers. To overcome this problem, a separate
  133. ** data structure called the wal-index is maintained to expedite the
  134. ** search for frames of a particular page.
  135. **
  136. ** WAL-INDEX FORMAT
  137. **
  138. ** Conceptually, the wal-index is shared memory, though VFS implementations
  139. ** might choose to implement the wal-index using a mmapped file. Because
  140. ** the wal-index is shared memory, SQLite does not support journal_mode=WAL
  141. ** on a network filesystem. All users of the database must be able to
  142. ** share memory.
  143. **
  144. ** The wal-index is transient. After a crash, the wal-index can (and should
  145. ** be) reconstructed from the original WAL file. In fact, the VFS is required
  146. ** to either truncate or zero the header of the wal-index when the last
  147. ** connection to it closes. Because the wal-index is transient, it can
  148. ** use an architecture-specific format; it does not have to be cross-platform.
  149. ** Hence, unlike the database and WAL file formats which store all values
  150. ** as big endian, the wal-index can store multi-byte values in the native
  151. ** byte order of the host computer.
  152. **
  153. ** The purpose of the wal-index is to answer this question quickly: Given
  154. ** a page number P, return the index of the last frame for page P in the WAL,
  155. ** or return NULL if there are no frames for page P in the WAL.
  156. **
  157. ** The wal-index consists of a header region, followed by one or
  158. ** more index blocks.
  159. **
  160. ** The wal-index header contains the total number of frames within the WAL
  161. ** in the mxFrame field.
  162. **
  163. ** Each index block except for the first contains information on
  164. ** HASHTABLE_NPAGE frames. The first index block contains information on
  165. ** HASHTABLE_NPAGE_ONE frames. The values of HASHTABLE_NPAGE_ONE and
  166. ** HASHTABLE_NPAGE are selected so that together the wal-index header and
  167. ** first index block are the same size as all other index blocks in the
  168. ** wal-index.
  169. **
  170. ** Each index block contains two sections, a page-mapping that contains the
  171. ** database page number associated with each wal frame, and a hash-table
  172. ** that allows readers to query an index block for a specific page number.
  173. ** The page-mapping is an array of HASHTABLE_NPAGE (or HASHTABLE_NPAGE_ONE
  174. ** for the first index block) 32-bit page numbers. The first entry in the
  175. ** first index-block contains the database page number corresponding to the
  176. ** first frame in the WAL file. The first entry in the second index block
  177. ** in the WAL file corresponds to the (HASHTABLE_NPAGE_ONE+1)th frame in
  178. ** the log, and so on.
  179. **
  180. ** The last index block in a wal-index usually contains less than the full
  181. ** complement of HASHTABLE_NPAGE (or HASHTABLE_NPAGE_ONE) page-numbers,
  182. ** depending on the contents of the WAL file. This does not change the
  183. ** allocated size of the page-mapping array - the page-mapping array merely
  184. ** contains unused entries.
  185. **
  186. ** Even without using the hash table, the last frame for page P
  187. ** can be found by scanning the page-mapping sections of each index block
  188. ** starting with the last index block and moving toward the first, and
  189. ** within each index block, starting at the end and moving toward the
  190. ** beginning. The first entry that equals P corresponds to the frame
  191. ** holding the content for that page.
  192. **
  193. ** The hash table consists of HASHTABLE_NSLOT 16-bit unsigned integers.
  194. ** HASHTABLE_NSLOT = 2*HASHTABLE_NPAGE, and there is one entry in the
  195. ** hash table for each page number in the mapping section, so the hash
  196. ** table is never more than half full. The expected number of collisions
  197. ** prior to finding a match is 1. Each entry of the hash table is a
  198. ** 1-based index of an entry in the mapping section of the same
  199. ** index block. Let K be the 1-based index of the largest entry in
  200. ** the mapping section. (For index blocks other than the last, K will
  201. ** always be exactly HASHTABLE_NPAGE (4096) and for the last index block
  202. ** K will be (mxFrame%HASHTABLE_NPAGE).) Unused slots of the hash table
  203. ** contain a value of 0.
  204. **
  205. ** To look for page P in the hash table, first compute a hash iKey on
  206. ** P as follows:
  207. **
  208. ** iKey = (P * 383) % HASHTABLE_NSLOT
  209. **
  210. ** Then start scanning entries of the hash table, starting with iKey
  211. ** (wrapping around to the beginning when the end of the hash table is
  212. ** reached) until an unused hash slot is found. Let the first unused slot
  213. ** be at index iUnused. (iUnused might be less than iKey if there was
  214. ** wrap-around.) Because the hash table is never more than half full,
  215. ** the search is guaranteed to eventually hit an unused entry. Let
  216. ** iMax be the value between iKey and iUnused, closest to iUnused,
  217. ** where aHash[iMax]==P. If there is no iMax entry (if there exists
  218. ** no hash slot such that aHash[i]==p) then page P is not in the
  219. ** current index block. Otherwise the iMax-th mapping entry of the
  220. ** current index block corresponds to the last entry that references
  221. ** page P.
  222. **
  223. ** A hash search begins with the last index block and moves toward the
  224. ** first index block, looking for entries corresponding to page P. On
  225. ** average, only two or three slots in each index block need to be
  226. ** examined in order to either find the last entry for page P, or to
  227. ** establish that no such entry exists in the block. Each index block
  228. ** holds over 4000 entries. So two or three index blocks are sufficient
  229. ** to cover a typical 10 megabyte WAL file, assuming 1K pages. 8 or 10
  230. ** comparisons (on average) suffice to either locate a frame in the
  231. ** WAL or to establish that the frame does not exist in the WAL. This
  232. ** is much faster than scanning the entire 10MB WAL.
  233. **
  234. ** Note that entries are added in order of increasing K. Hence, one
  235. ** reader might be using some value K0 and a second reader that started
  236. ** at a later time (after additional transactions were added to the WAL
  237. ** and to the wal-index) might be using a different value K1, where K1>K0.
  238. ** Both readers can use the same hash table and mapping section to get
  239. ** the correct result. There may be entries in the hash table with
  240. ** K>K0 but to the first reader, those entries will appear to be unused
  241. ** slots in the hash table and so the first reader will get an answer as
  242. ** if no values greater than K0 had ever been inserted into the hash table
  243. ** in the first place - which is what reader one wants. Meanwhile, the
  244. ** second reader using K1 will see additional values that were inserted
  245. ** later, which is exactly what reader two wants.
  246. **
  247. ** When a rollback occurs, the value of K is decreased. Hash table entries
  248. ** that correspond to frames greater than the new K value are removed
  249. ** from the hash table at this point.
  250. *************************************************************************
  251. ** Included in SQLite3 port to C#-SQLite; 2008 Noah B Hart
  252. ** C#-SQLite is an independent reimplementation of the SQLite software library
  253. **
  254. ** SQLITE_SOURCE_ID: 2010-12-07 20:14:09 a586a4deeb25330037a49df295b36aaf624d0f45
  255. **
  256. *************************************************************************
  257. */
  258. #if !SQLITE_OMIT_WAL
  259. //#include "wal.h"
  260. /*
  261. ** Trace output macros
  262. */
  #if (SQLITE_TEST) && (SQLITE_DEBUG)
  /* Non-zero switches WAL trace output on. Declared static because the
  ** static WALTRACE() below reads it. */
  static int sqlite3WalTrace = 0;
  //# define WALTRACE(X) if(sqlite3WalTrace) sqlite3DebugPrintf X
  /*
  ** Pass the trace arguments X on to sqlite3DebugPrintf() when tracing is
  ** enabled; otherwise do nothing.
  **
  ** NOTE(review): sqlite3DebugPrintf() is declared outside this file; confirm
  ** that it accepts the object[] forwarded here.
  */
  static void WALTRACE( params object[] X )
  {
    if ( sqlite3WalTrace != 0 )
    {
      sqlite3DebugPrintf( X );
    }
  }
  #else
  //# define WALTRACE(X)
  /* Tracing is compiled out: accept and ignore any arguments. */
  static void WALTRACE( params object[] X )
  {
  }
  #endif
  274. /*
  275. ** The maximum (and only) versions of the wal and wal-index formats
  276. ** that may be interpreted by this version of SQLite.
  277. **
  278. ** If a client begins recovering a WAL file and finds that (a) the checksum
  279. ** values in the wal-header are correct and (b) the version field is not
  280. ** WAL_MAX_VERSION, recovery fails and SQLite returns SQLITE_CANTOPEN.
  281. **
  282. ** Similarly, if a client successfully reads a wal-index header (i.e. the
  283. ** checksum test is successful) and finds that the version field is not
  284. ** WALINDEX_MAX_VERSION, then no read-transaction is opened and SQLite
  285. ** returns SQLITE_CANTOPEN.
  286. */
  287. //#define WAL_MAX_VERSION 3007000
  288. //#define WALINDEX_MAX_VERSION 3007000
  289. const int WAL_MAX_VERSION = 3007000;
  290. const int WALINDEX_MAX_VERSION = 3007000;
  291. /*
  292. ** Indices of various locking bytes. WAL_NREADER is the number
  293. ** of available reader locks and should be at least 3.
  294. */
  295. //#define WAL_WRITE_LOCK 0
  296. //#define WAL_ALL_BUT_WRITE 1
  297. //#define WAL_CKPT_LOCK 1
  298. //#define WAL_RECOVER_LOCK 2
  299. //#define WAL_READ_LOCK(I) (3+(I))
  300. //#define WAL_NREADER (SQLITE_SHM_NLOCK-3)
  301. const int WAL_WRITE_LOCK = 0;
  302. const int WAL_ALL_BUT_WRITE = 1;
  303. const int WAL_CKPT_LOCK = 1;
  304. const int WAL_RECOVER_LOCK = 2;
  /*
  ** Return the locking-byte index of reader lock I. The three indices below
  ** it (0, 1 and 2) are the write, checkpoint and recovery locks, so reader
  ** lock 0 is index 3. This was a function-like macro in C, so it is ported
  ** as a method rather than a constant.
  */
  static int WAL_READ_LOCK( int I )
  {
    return 3 + I;
  }
  306. const int WAL_NREADER = (SQLITE_SHM_NLOCK-3);
  307. /* Object declarations */
  /* C# has no typedef and needs no forward declarations; the original C
  ** lines are kept as comments for reference against wal.c. */
  //typedef struct WalIndexHdr WalIndexHdr;
  //typedef struct WalIterator WalIterator;
  //typedef struct WalCkptInfo WalCkptInfo;
  311. /*
  312. ** The following object holds a copy of the wal-index header content.
  313. **
  314. ** The actual header in the wal-index consists of two copies of this
  315. ** object.
  316. */
  /*
  ** In-memory copy of one wal-index header.
  **
  ** As declared, the fields total 48 bytes:
  **   3*4 (iVersion, unused, iChange) + 1 + 1 + 2 (isInit, bigEndCksum,
  **   szPage) + 2*4 (mxFrame, nPage) + 3*8 (the three two-element arrays).
  ** aCksum must stay the last field: walIndexWriteHdr() checksums every byte
  ** that precedes it.
  **
  ** NOTE(review): this is still the C declaration. `u32 x[2];` is not valid
  ** C# field syntax, and u8/u16 have no `using` alias at the top of this file.
  */
  317. struct WalIndexHdr {
  318. u32 iVersion; /* Wal-index version */
  319. u32 unused; /* Unused (padding) field */
  320. u32 iChange; /* Counter incremented each transaction */
  321. u8 isInit; /* 1 when initialized */
  322. u8 bigEndCksum; /* True if checksums in WAL are big-endian */
  323. u16 szPage; /* Database page size in bytes */
  324. u32 mxFrame; /* Index of last valid frame in the WAL */
  325. u32 nPage; /* Size of database in pages */
  326. u32 aFrameCksum[2]; /* Checksum of last frame in log */
  327. u32 aSalt[2]; /* Two salt values copied from WAL header */
  328. u32 aCksum[2]; /* Checksum over all prior fields */
  329. };
  330. /*
  331. ** A copy of the following object occurs in the wal-index immediately
  332. ** following the second copy of the WalIndexHdr. This object stores
  333. ** information used by checkpoint.
  334. **
  335. ** nBackfill is the number of frames in the WAL that have been written
  336. ** back into the database. (We call the act of moving content from WAL to
  337. ** database "backfilling".) The nBackfill number is never greater than
  338. ** WalIndexHdr.mxFrame. nBackfill can only be increased by threads
  339. ** holding the WAL_CKPT_LOCK lock (which includes a recovery thread).
  340. ** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from
  341. ** mxFrame back to zero when the WAL is reset.
  342. **
  343. ** There is one entry in aReadMark[] for each reader lock. If a reader
  344. ** holds read-lock K, then the value in aReadMark[K] is no greater than
  345. ** the mxFrame for that reader. The value READMARK_NOT_USED (0xffffffff)
  346. ** for any aReadMark[] means that entry is unused. aReadMark[0] is
  347. ** a special case; its value is never used and it exists as a place-holder
  348. ** to avoid having to offset aReadMark[] indexes by one. Readers holding
  349. ** WAL_READ_LOCK(0) always ignore the entire WAL and read all content
  350. ** directly from the database.
  351. **
  352. ** The value of aReadMark[K] may only be changed by a thread that
  353. ** is holding an exclusive lock on WAL_READ_LOCK(K). Thus, the value of
  354. ** aReadMark[K] cannot change while there is a reader using that mark
  355. ** since the reader will be holding a shared lock on WAL_READ_LOCK(K).
  356. **
  357. ** The checkpointer may only transfer frames from WAL to database where
  358. ** the frame numbers are less than or equal to every aReadMark[] that is
  359. ** in use (that is, every aReadMark[j] for which there is a corresponding
  360. ** WAL_READ_LOCK(j)). New readers (usually) pick the aReadMark[] with the
  361. ** largest value and will increase an unused aReadMark[] to mxFrame if there
  362. ** is not already an aReadMark[] equal to mxFrame. The exception to the
  363. ** previous sentence is when nBackfill equals mxFrame (meaning that everything
  364. ** in the WAL has been backfilled into the database) then new readers
  365. ** will choose aReadMark[0] which has value 0 and hence such readers will
  366. ** get all their content directly from the database file and ignore
  367. ** the WAL.
  368. **
  369. ** Writers normally append new frames to the end of the WAL. However,
  370. ** if nBackfill equals mxFrame (meaning that all WAL content has been
  371. ** written back into the database) and if no readers are using the WAL
  372. ** (in other words, if there are no WAL_READ_LOCK(i) where i>0) then
  373. ** the writer will first "reset" the WAL back to the beginning and start
  374. ** writing new content beginning at frame 1.
  375. **
  376. ** We assume that 32-bit loads are atomic and so no locks are needed in
  377. ** order to read from any aReadMark[] entries.
  378. */
  /*
  ** Checkpoint bookkeeping stored in the wal-index after the two header
  ** copies: one backfill counter followed by one read-mark per reader lock
  ** (WAL_NREADER entries).
  **
  ** NOTE(review): still the C declaration; a sized array field
  ** (`u32 aReadMark[WAL_NREADER];`) is not valid C# syntax.
  */
  379. struct WalCkptInfo {
  380. u32 nBackfill; /* Number of WAL frames backfilled into DB */
  381. u32 aReadMark[WAL_NREADER]; /* Reader marks */
  382. };
  //#define READMARK_NOT_USED 0xffffffff
  /* Marker stored in an aReadMark[] slot that no reader is using. The slots
  ** are u32, and 0xffffffff does not fit in a signed int, so the constant is
  ** declared u32. */
  const u32 READMARK_NOT_USED = 0xffffffff;
  385. /* A block of WALINDEX_LOCK_RESERVED bytes beginning at
  386. ** WALINDEX_LOCK_OFFSET is reserved for locks. Since some systems
  387. ** only support mandatory file-locks, we do not read or write data
  388. ** from the region of the file on which locks are applied.
  389. */
  //#define WALINDEX_LOCK_OFFSET (sizeof(WalIndexHdr)*2 + sizeof(WalCkptInfo))
  //#define WALINDEX_LOCK_RESERVED 16
  //#define WALINDEX_HDR_SIZE (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)
  /* sizeof() on WalIndexHdr/WalCkptInfo is not a constant expression in C#,
  ** so the byte sizes of the two structures are written out from their field
  ** lists:
  **   WalIndexHdr = 3*4 + 1 + 1 + 2 + 2*4 + 3*(2*4) = 48 bytes
  **   WalCkptInfo = 4 (nBackfill) + 4*WAL_NREADER (aReadMark[])
  ** These values describe the wal-index layout and must be kept in step with
  ** the structure declarations. */
  const int WALINDEX_LOCK_OFFSET = ( 48 * 2 + ( 4 + 4 * WAL_NREADER ) );
  const int WALINDEX_LOCK_RESERVED = 16;
  const int WALINDEX_HDR_SIZE = ( WALINDEX_LOCK_OFFSET + WALINDEX_LOCK_RESERVED );
  396. /* Size of header before each frame in wal */
  397. //#define WAL_FRAME_HDRSIZE 24
  398. const int WAL_FRAME_HDRSIZE =24;
  399. /* Size of write ahead log header, including checksum. */
  400. /* #define WAL_HDRSIZE 24 */
  401. //#define WAL_HDRSIZE 32
  402. const int WAL_HDRSIZE =32;
  403. /* WAL magic value. Either this value, or the same value with the least
  404. ** significant bit also set (WAL_MAGIC | 0x00000001) is stored in 32-bit
  405. ** big-endian format in the first 4 bytes of a WAL file.
  406. **
  407. ** If the LSB is set, then the checksums for each frame within the WAL
  408. ** file are calculated by treating all data as an array of 32-bit
  409. ** big-endian words. Otherwise, they are calculated by interpreting
  410. ** all data as 32-bit little-endian words.
  411. */
  412. //#define WAL_MAGIC 0x377f0682
  413. const int WAL_MAGIC = 0x377f0682;
  414. /*
  415. ** Return the offset of frame iFrame in the write-ahead log file,
  416. ** assuming a database page size of szPage bytes. The offset returned
  417. ** is to the start of the write-ahead log frame-header.
  418. */
  419. //#define walFrameOffset(iFrame, szPage) ( \
  420. // WAL_HDRSIZE + ((iFrame)-1)*(i64)((szPage)+WAL_FRAME_HDRSIZE) \
  421. //)
  /*
  ** Return the byte offset, within the WAL file, of the frame-header of frame
  ** iFrame (frames are numbered from 1) for a page size of szPage bytes.
  **
  ** The result is 64-bit: the product of the frame index and the frame size
  ** can exceed the range of a 32-bit int, which is why the C macro casts to
  ** i64 before multiplying.
  */
  static i64 walFrameOffset( u32 iFrame, int szPage )
  {
    i64 nFrameSize = (i64)( szPage + WAL_FRAME_HDRSIZE ); /* Header + page */
    return WAL_HDRSIZE + ( iFrame - 1 ) * nFrameSize;
  }
  425. /*
  426. ** An open write-ahead log file is represented by an instance of the
  427. ** following object.
  428. */
  /*
  ** Per-connection handle for an open write-ahead log.
  **
  ** apWiData[] holds nWiData wal-index page pointers; walIndexPage() grows
  ** the array and fills entries on demand. exclusiveMode is also compared
  ** against WAL_HEAPMEMORY_MODE (see walIndexPage() and walShmBarrier()), in
  ** which case wal-index pages are heap allocations rather than VFS
  ** shared-memory mappings.
  **
  ** NOTE(review): still the C declaration (pointer fields, `const char *`);
  ** u8/u16/i16 have no `using` alias at the top of this file.
  */
  429. struct Wal {
  430. sqlite3_vfs *pVfs; /* The VFS used to create pDbFd */
  431. sqlite3_file *pDbFd; /* File handle for the database file */
  432. sqlite3_file *pWalFd; /* File handle for WAL file */
  433. u32 iCallback; /* Value to pass to log callback (or 0) */
  434. int nWiData; /* Size of array apWiData */
  435. volatile u32 **apWiData; /* Pointer to wal-index content in memory */
  436. u16 szPage; /* Database page size */
  437. i16 readLock; /* Which read lock is being held. -1 for none */
  438. u8 exclusiveMode; /* Non-zero if connection is in exclusive mode */
  439. u8 writeLock; /* True if in a write transaction */
  440. u8 ckptLock; /* True if holding a checkpoint lock */
  441. u8 readOnly; /* True if the WAL file is open read-only */
  442. WalIndexHdr hdr; /* Wal-index header for current transaction */
  443. const char *zWalName; /* Name of WAL file */
  444. u32 nCkpt; /* Checkpoint sequence counter in the wal-header */
  445. #if SQLITE_DEBUG
  446. u8 lockError; /* True if a locking error has occurred */
  447. #endif
  448. };
  449. /*
  450. ** Each page of the wal-index mapping contains a hash-table made up of
  451. ** an array of HASHTABLE_NSLOT elements of the following type.
  452. */
  /* C# has no typedef. ht_slot (a 16-bit unsigned hash-table slot) is
  ** declared with the other aliases at the top of this file:
  **   using ht_slot = System.UInt16;
  */
  //typedef u16 ht_slot;
  454. /*
  455. ** This structure is used to implement an iterator that loops through
  456. ** all frames in the WAL in database page order. Where two or more frames
  457. ** correspond to the same database page, the iterator visits only the
  458. ** frame most recently written to the WAL (in other words, the frame with
  459. ** the largest index).
  460. **
  461. ** The internals of this structure are only accessed by:
  462. **
  463. ** walIteratorInit() - Create a new iterator,
  464. ** walIteratorNext() - Step an iterator,
  465. ** walIteratorFree() - Free an iterator.
  466. **
  467. ** This functionality is used by the checkpoint code (see walCheckpoint()).
  468. */
  /*
  ** Iterator state: the last value returned plus one WalSegment per
  ** wal-index page.
  **
  ** NOTE(review): aSegment is declared with a single element although
  ** nSegment gives its real length; presumably the allocation in
  ** walIteratorInit() (not in view) is over-sized in the usual C
  ** trailing-array idiom - confirm before porting to a managed array.
  */
  469. struct WalIterator {
  470. int iPrior; /* Last result returned from the iterator */
  471. int nSegment; /* Size of the aSegment[] array */
  472. struct WalSegment {
  473. int iNext; /* Next slot in aIndex[] not yet returned */
  474. ht_slot *aIndex; /* i0, i1, i2... such that aPgno[iN] ascend */
  475. u32 *aPgno; /* Array of page numbers. */
  476. int nEntry; /* Max size of aPgno[] and aIndex[] arrays */
  477. int iZero; /* Frame number associated with aPgno[0] */
  478. } aSegment[1]; /* One for every 32KB page in the WAL */
  479. };
  480. /*
  481. ** Define the parameters of the hash tables in the wal-index file. There
  482. ** is a hash-table following every HASHTABLE_NPAGE page numbers in the
  483. ** wal-index.
  484. **
  485. ** Changing any of these constants will alter the wal-index format and
  486. ** create incompatibilities.
  487. */
  488. //#define HASHTABLE_NPAGE 4096 /* Must be power of 2 */
  489. //#define HASHTABLE_HASH_1 383 /* Should be prime */
  490. //#define HASHTABLE_NSLOT (HASHTABLE_NPAGE*2) /* Must be a power of 2 */
  491. const int HASHTABLE_NPAGE = 4096 ;
  492. const int HASHTABLE_HASH_1 = 383 ;
  493. const int HASHTABLE_NSLOT = (HASHTABLE_NPAGE*2);
  494. /*
  495. ** The block of page numbers associated with the first hash-table in a
  496. ** wal-index is smaller than usual. This is so that there is a complete
  497. ** hash-table on each aligned 32KB page of the wal-index.
  498. */
  499. //#define HASHTABLE_NPAGE_ONE (HASHTABLE_NPAGE - (WALINDEX_HDR_SIZE/sizeof(u32)))
  500. const int HASHTABLE_NPAGE_ONE =(HASHTABLE_NPAGE - (WALINDEX_HDR_SIZE/sizeof(u32)));
  501. /* The wal-index is divided into pages of WALINDEX_PGSZ bytes each. */
  502. //#define WALINDEX_PGSZ ( \
  503. // sizeof(ht_slot)*HASHTABLE_NSLOT + HASHTABLE_NPAGE*sizeof(u32) \
  504. //)
  /* Size in bytes of one wal-index page: a hash table of HASHTABLE_NSLOT
  ** ht_slot entries plus HASHTABLE_NPAGE u32 page numbers. Declared as a
  ** constant, not a method, because every use in this file (see
  ** walIndexPage()) names it as a plain value, exactly like the C macro. */
  const int WALINDEX_PGSZ = ( sizeof( ht_slot ) * HASHTABLE_NSLOT + HASHTABLE_NPAGE * sizeof( u32 ) );
  508. /*
  509. ** Obtain a pointer to the iPage'th page of the wal-index. The wal-index
  510. ** is broken into pages of WALINDEX_PGSZ bytes. Wal-index pages are
  511. ** numbered from zero.
  512. **
  513. ** If this call is successful, *ppPage is set to point to the wal-index
  514. ** page and SQLITE_OK is returned. If an error (an OOM or VFS error) occurs,
  515. ** then an SQLite error code is returned and *ppPage is set to 0.
  516. */
  /*
  ** Look up (creating or mapping it if necessary) wal-index page iPage and
  ** store its address in *ppPage. Returns SQLITE_OK, SQLITE_NOMEM, or
  ** whatever error sqlite3OsShmMap() reports.
  **
  ** NOTE(review): unported C - pointer parameters, sqlite3_realloc() and
  ** memset() have no direct C# equivalent here.
  */
  517. static int walIndexPage(Wal *pWal, int iPage, volatile u32 **ppPage){
  518. int rc = SQLITE_OK;
  519. /* Enlarge the pWal.apWiData[] array if required */
  520. if( pWal.nWiData<=iPage ){
  521. int nByte = sizeof(u32*)*(iPage+1);
  522. volatile u32 **apNew;
  523. apNew = (volatile u32 **)sqlite3_realloc((void *)pWal.apWiData, nByte);
  524. if( !apNew ){
  /* Allocation failed: pWal.apWiData and pWal.nWiData are left untouched. */
  525. *ppPage = 0;
  526. return SQLITE_NOMEM;
  527. }
  /* Zero only the newly added slots so that the "==0" test below treats them
  ** as pages that have not been obtained yet. */
  528. memset((void*)&apNew[pWal.nWiData], 0,
  529. sizeof(u32*)*(iPage+1-pWal.nWiData));
  530. pWal.apWiData = apNew;
  531. pWal.nWiData = iPage+1;
  532. }
  533. /* Request a pointer to the required page from the VFS */
  534. if( pWal.apWiData[iPage]==0 ){
  535. if( pWal.exclusiveMode==WAL_HEAPMEMORY_MODE ){
  /* Heap-memory mode: the page is a private zero-filled allocation. */
  536. pWal.apWiData[iPage] = (u32 volatile *)sqlite3MallocZero(WALINDEX_PGSZ);
  537. if( !pWal.apWiData[iPage] ) rc = SQLITE_NOMEM;
  538. }else{
  /* Otherwise ask the VFS to map the page; pWal.writeLock is passed as the
  ** fourth argument of sqlite3OsShmMap(). */
  539. rc = sqlite3OsShmMap(pWal.pDbFd, iPage, WALINDEX_PGSZ,
  540. pWal.writeLock, (void volatile **)&pWal.apWiData[iPage]
  541. );
  542. }
  543. }
  544. *ppPage = pWal.apWiData[iPage];
  /* For iPage>0 a successful return must yield a non-null page; page 0 is
  ** exempt from this check. */
  545. Debug.Assert( iPage==0 || *ppPage || rc!=SQLITE_OK );
  546. return rc;
  547. }
  548. /*
  549. ** Return a pointer to the WalCkptInfo structure in the wal-index.
  550. */
  static volatile WalCkptInfo *walCkptInfo(Wal *pWal){
    volatile u32 *aPage0; /* First page of the wal-index */
    Debug.Assert( pWal.nWiData>0 && pWal.apWiData[0] );
    aPage0 = pWal.apWiData[0];
    /* aPage0[] is indexed in 4-byte u32 units, so sizeof(WalIndexHdr)/2
    ** elements skip 2*sizeof(WalIndexHdr) bytes: both header copies. */
    return (volatile WalCkptInfo*)&(aPage0[sizeof(WalIndexHdr)/2]);
  }
  555. /*
  556. ** Return a pointer to the WalIndexHdr structure in the wal-index.
  557. */
  static volatile WalIndexHdr *walIndexHdr(Wal *pWal){
    volatile u32 *aPage0; /* First page of the wal-index */
    Debug.Assert( pWal.nWiData>0 && pWal.apWiData[0] );
    aPage0 = pWal.apWiData[0];
    /* The header sits at the very start of the first wal-index page. */
    return (volatile WalIndexHdr*)aPage0;
  }
  562. /*
  563. ** The argument to this macro must be of type u32. On a little-endian
  564. ** architecture, it returns the u32 value that results from interpreting
  565. ** the 4 bytes as a big-endian value. On a big-endian architecture, it
  566. ** returns the value that would be produced by interpreting the 4 bytes
  567. ** of the input value as a little-endian integer.
  568. */
  569. //#define BYTESWAP32(x) ( \
  570. // (((x)&0x000000FF)<<24) + (((x)&0x0000FF00)<<8) \
  571. // + (((x)&0x00FF0000)>>8) + (((x)&0xFF000000)>>24) \
  572. //)
  /*
  ** Return x with the order of its four bytes reversed.
  **
  ** The parameter and result are u32, as the macro comment requires: with a
  ** signed int the 0xFF000000 mask promotes the expression to long, and a
  ** signed right shift would sign-extend the top byte.
  */
  static u32 BYTESWAP32( u32 x )
  {
    return ( ( x & 0x000000FF ) << 24 )
         | ( ( x & 0x0000FF00 ) << 8 )
         | ( ( x & 0x00FF0000 ) >> 8 )
         | ( ( x & 0xFF000000 ) >> 24 );
  }
  577. /*
  578. ** Generate or extend an 8 byte checksum based on the data in
  579. ** array aByte[] and the initial values of aIn[0] and aIn[1] (or
  580. ** initial values of 0 and 0 if aIn==NULL).
  581. **
  582. ** The checksum is written back into aOut[] before returning.
  583. **
  584. ** nByte must be a positive multiple of 8.
  585. */
  /*
  ** Fold nByte bytes of a[] into the running two-word checksum and write the
  ** result to aOut[]. The checksum starts from aIn[] when aIn is non-null and
  ** from (0,0) otherwise.
  **
  ** NOTE(review): unported C - pointer parameters and pointer arithmetic.
  */
  586. static void walChecksumBytes(
  587. int nativeCksum, /* True for native byte-order, false for non-native */
  588. u8 *a, /* Content to be checksummed */
  589. int nByte, /* Bytes of content in a[]. Must be a multiple of 8. */
  590. const u32 *aIn, /* Initial checksum value input */
  591. u32 *aOut /* OUT: Final checksum value output */
  592. ){
  593. u32 s1, s2;
  /* View the input as 32-bit words; aEnd is one past the last word. */
  594. u32 *aData = (u32 *)a;
  595. u32 *aEnd = (u32 *)&a[nByte];
  /* Both input words are read before anything is written to aOut[], so aIn
  ** and aOut may be the same array (walEncodeFrame() relies on this). */
  596. if( aIn ){
  597. s1 = aIn[0];
  598. s2 = aIn[1];
  599. }else{
  600. s1 = s2 = 0;
  601. }
  /* The do-while loops below always consume at least one 8-byte pair, which
  ** is why nByte must be at least 8 and a multiple of 8. */
  602. Debug.Assert( nByte>=8 );
  603. Debug.Assert( (nByte&0x00000007)==0 );
  604. if( nativeCksum ){
  /* Words are used exactly as stored in memory. */
  605. do {
  606. s1 += *aData++ + s2;
  607. s2 += *aData++ + s1;
  608. }while( aData<aEnd );
  609. }else{
  /* Same recurrence, with every word byte-swapped before it is added. */
  610. do {
  611. s1 += BYTESWAP32(aData[0]) + s2;
  612. s2 += BYTESWAP32(aData[1]) + s1;
  613. aData += 2;
  614. }while( aData<aEnd );
  615. }
  616. aOut[0] = s1;
  617. aOut[1] = s2;
  618. }
  /*
  ** Issue a shared-memory barrier through the VFS, unless the wal-index
  ** lives in heap memory (WAL_HEAPMEMORY_MODE), in which case nothing is done.
  */
  static void walShmBarrier(Wal *pWal){
    if( pWal.exclusiveMode==WAL_HEAPMEMORY_MODE ) return;
    sqlite3OsShmBarrier(pWal.pDbFd);
  }
  624. /*
  625. ** Write the header information in pWal.hdr into the wal-index.
  626. **
  627. ** The checksum on pWal.hdr is updated before it is written.
  628. */
  /*
  ** Stamp pWal.hdr as initialized with the current wal-index version,
  ** recompute its checksum, and copy it over both header copies in the
  ** wal-index. The caller must hold the write lock (asserted below).
  **
  ** NOTE(review): unported C - offsetof(), memcpy() and pointer casts.
  */
  629. static void walIndexWriteHdr(Wal *pWal){
  630. volatile WalIndexHdr *aHdr = walIndexHdr(pWal);
  /* Checksum every byte that precedes the aCksum field itself. */
  631. const int nCksum = offsetof(WalIndexHdr, aCksum);
  632. Debug.Assert( pWal.writeLock );
  633. pWal.hdr.isInit = 1;
  634. pWal.hdr.iVersion = WALINDEX_MAX_VERSION;
  /* First argument 1: checksum in native byte order; no initial value (0). */
  635. walChecksumBytes(1, (u8*)&pWal.hdr, nCksum, 0, pWal.hdr.aCksum);
  /* The second copy is written first, then a barrier, then the first copy.
  ** Keep this order. Presumably it lets a reader that compares the two
  ** copies detect a half-written header - confirm against the header-reading
  ** code, which is not in view. */
  636. memcpy((void *)&aHdr[1], (void *)&pWal.hdr, sizeof(WalIndexHdr));
  637. walShmBarrier(pWal);
  638. memcpy((void *)&aHdr[0], (void *)&pWal.hdr, sizeof(WalIndexHdr));
  639. }
  640. /*
  641. ** This function encodes a single frame header and writes it to a buffer
  642. ** supplied by the caller. A frame-header is made up of a series of
  643. ** 4-byte big-endian integers, as follows:
  644. **
  645. ** 0: Page number.
  646. ** 4: For commit records, the size of the database image in pages
  647. ** after the commit. For all other records, zero.
  648. ** 8: Salt-1 (copied from the wal-header)
  649. ** 12: Salt-2 (copied from the wal-header)
  650. ** 16: Checksum-1.
  651. ** 20: Checksum-2.
  652. */
  653. static void walEncodeFrame(
  654. Wal *pWal, /* The write-ahead log */
  655. u32 iPage, /* Database page number for frame */
  656. u32 nTruncate, /* New db size (or 0 for non-commit frames) */
  657. u8 *aData, /* Pointer to page data */
  658. u8 *aFrame /* OUT: Write encoded frame here */
  659. ){
  660. int nativeCksum; /* True for native byte-order checksums */
  661. u32 *aCksum = pWal.hdr.aFrameCksum; /* Running checksum; updated in place below */
  662. Debug.Assert( WAL_FRAME_HDRSIZE==24 );
       /* Bytes 0..15 of the header: page number, db size, and the 8-byte salt. */
  663. sqlite3Put4byte(&aFrame[0], iPage);
  664. sqlite3Put4byte(&aFrame[4], nTruncate);
  665. memcpy(&aFrame[8], pWal.hdr.aSalt, 8);
  666. nativeCksum = (pWal.hdr.bigEndCksum==SQLITE_BIGENDIAN);
       /* Extend the running checksum over the first 8 header bytes (page
       ** number and db size only - the salt is not included) and then over
       ** the page content. Because aCksum aliases pWal.hdr.aFrameCksum,
       ** the in-memory header is advanced as a side effect. */
  667. walChecksumBytes(nativeCksum, aFrame, 8, aCksum, aCksum);
  668. walChecksumBytes(nativeCksum, aData, pWal.szPage, aCksum, aCksum);
       /* Bytes 16..23 of the header: the resulting checksum. */
  669. sqlite3Put4byte(&aFrame[16], aCksum[0]);
  670. sqlite3Put4byte(&aFrame[20], aCksum[1]);
  671. }
  672. /*
  673. ** Check to see if the frame with header in aFrame[] and content
  674. ** in aData[] is valid. If it is a valid frame, fill *piPage and
  675. ** *pnTruncate and return true. Return false (0) if the frame is not valid.
  676. */
  677. static int walDecodeFrame(
  678. Wal *pWal, /* The write-ahead log */
  679. u32 *piPage, /* OUT: Database page number for frame */
  680. u32 *pnTruncate, /* OUT: New db size (or 0 if not commit) */
  681. u8 *aData, /* Pointer to page data (for checksum) */
  682. u8 *aFrame /* Frame data */
  683. ){
  684. int nativeCksum; /* True for native byte-order checksums */
  685. u32 *aCksum = pWal.hdr.aFrameCksum; /* Running checksum; updated in place below */
  686. u32 pgno; /* Page number of the frame */
  687. Debug.Assert( WAL_FRAME_HDRSIZE==24 );
  688. /* A frame is only valid if the salt values in the frame-header
  689. ** match the salt values in the wal-header.
  690. */
  691. if( memcmp(&pWal.hdr.aSalt, &aFrame[8], 8)!=0 ){
  692. return 0;
  693. }
  694. /* A frame is only valid if the page number is greater than zero.
  695. */
  696. pgno = sqlite3Get4byte(&aFrame[0]);
  697. if( pgno==0 ){
  698. return 0;
  699. }
  700. /* A frame is only valid if a checksum of the WAL header,
  701. ** all prior frames, the first 8 bytes of this frame-header,
  702. ** and the frame-data matches the checksum in the last 8
  703. ** bytes of this frame-header.
  704. */
  705. nativeCksum = (pWal.hdr.bigEndCksum==SQLITE_BIGENDIAN);
  706. walChecksumBytes(nativeCksum, aFrame, 8, aCksum, aCksum);
  707. walChecksumBytes(nativeCksum, aData, pWal.szPage, aCksum, aCksum);
       /* Note: pWal.hdr.aFrameCksum has already been advanced by the two
       ** calls above, even if the comparison below rejects the frame. */
  708. if( aCksum[0]!=sqlite3Get4byte(&aFrame[16])
  709. || aCksum[1]!=sqlite3Get4byte(&aFrame[20])
  710. ){
  711. /* Checksum failed. */
  712. return 0;
  713. }
  714. /* If we reach this point, the frame is valid. Return the page number
  715. ** and the new database size. The output parameters are written only
  716. ** on this success path.
  717. */
  717. *piPage = pgno;
  718. *pnTruncate = sqlite3Get4byte(&aFrame[4]);
  719. return 1;
  720. }
  721. #if (SQLITE_TEST) && (SQLITE_DEBUG)
  722. /*
  723. ** Names of locks. This routine is used to provide debugging output and is not
  724. ** a part of an ordinary build.
  725. **
       ** Returns a string literal for the write, checkpoint and recover locks.
       ** Any other index is formatted as "READ-LOCK[n]" into a function-static
       ** buffer, so that result is overwritten by the next such call.
       */
  726. static const char *walLockName(int lockIdx){
  727. if( lockIdx==WAL_WRITE_LOCK ){
  728. return "WRITE-LOCK";
  729. }else if( lockIdx==WAL_CKPT_LOCK ){
  730. return "CKPT-LOCK";
  731. }else if( lockIdx==WAL_RECOVER_LOCK ){
  732. return "RECOVER-LOCK";
  733. }else{
  734. static char zName[15]; /* Shared scratch buffer for read-lock names */
       /* n is the lock index relative to the first read lock. */
  735. sqlite3_snprintf(sizeof(zName), zName, "READ-LOCK[%d]",
  736. lockIdx-WAL_READ_LOCK(0));
  737. return zName;
  738. }
  739. }
  740. #endif // (SQLITE_TEST) && (SQLITE_DEBUG)
  741. /*
  742. ** Set or release locks on the WAL. Locks are either shared or exclusive.
  743. ** A lock cannot be moved directly between shared and exclusive - it must go
  744. ** through the unlocked state first.
  745. **
  746. ** In locking_mode=EXCLUSIVE, all of these routines become no-ops.
  747. */
  /*
  ** Take a SHARED lock on the single lock byte lockIdx. Returns SQLITE_OK
  ** without touching the VFS when the WAL is in exclusive mode; otherwise
  ** returns whatever sqlite3OsShmLock() reports.
  */
  static int walLockShared(Wal *pWal, int lockIdx){
    int rc = SQLITE_OK;
    if( !pWal.exclusiveMode ){
      int flags = SQLITE_SHM_LOCK | SQLITE_SHM_SHARED;
      rc = sqlite3OsShmLock(pWal.pDbFd, lockIdx, 1, flags);
      WALTRACE(("WAL%p: acquire SHARED-%s %s\n", pWal,
                walLockName(lockIdx), rc ? "failed" : "ok"));
      /* Record any failure other than SQLITE_BUSY (debug builds only). */
      VVA_ONLY( pWal.lockError = (u8)(rc!=SQLITE_OK && rc!=SQLITE_BUSY); )
    }
    return rc;
  }
  /*
  ** Release a SHARED lock on the single lock byte lockIdx. A no-op in
  ** exclusive mode. The result of the VFS call is deliberately ignored.
  */
  static void walUnlockShared(Wal *pWal, int lockIdx){
    if( !pWal.exclusiveMode ){
      int flags = SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED;
      (void)sqlite3OsShmLock(pWal.pDbFd, lockIdx, 1, flags);
      WALTRACE(("WAL%p: release SHARED-%s\n", pWal, walLockName(lockIdx)));
    }
  }
  /*
  ** Take an EXCLUSIVE lock on n consecutive lock bytes starting at lockIdx.
  ** Returns SQLITE_OK without touching the VFS when the WAL is in exclusive
  ** mode; otherwise returns whatever sqlite3OsShmLock() reports.
  */
  static int walLockExclusive(Wal *pWal, int lockIdx, int n){
    int rc = SQLITE_OK;
    if( !pWal.exclusiveMode ){
      int flags = SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE;
      rc = sqlite3OsShmLock(pWal.pDbFd, lockIdx, n, flags);
      WALTRACE(("WAL%p: acquire EXCLUSIVE-%s cnt=%d %s\n", pWal,
                walLockName(lockIdx), n, rc ? "failed" : "ok"));
      /* Record any failure other than SQLITE_BUSY (debug builds only). */
      VVA_ONLY( pWal.lockError = (u8)(rc!=SQLITE_OK && rc!=SQLITE_BUSY); )
    }
    return rc;
  }
  /*
  ** Release an EXCLUSIVE lock on n consecutive lock bytes starting at
  ** lockIdx. A no-op in exclusive mode. The result of the VFS call is
  ** deliberately ignored.
  */
  static void walUnlockExclusive(Wal *pWal, int lockIdx, int n){
    if( !pWal.exclusiveMode ){
      int flags = SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE;
      (void)sqlite3OsShmLock(pWal.pDbFd, lockIdx, n, flags);
      WALTRACE(("WAL%p: release EXCLUSIVE-%s cnt=%d\n", pWal,
                walLockName(lockIdx), n));
    }
  }
  781. /*
  782. ** Compute a hash on a page number. The resulting hash value must land
  783. ** between 0 and (HASHTABLE_NSLOT-1). The walNextHash() function advances
  784. ** the hash to the next value in the event of a collision.
  785. */
  static int walHash(u32 iPage){
    /* The mask below only works when the slot count is a power of two. */
    Debug.Assert( (HASHTABLE_NSLOT & (HASHTABLE_NSLOT-1))==0 );
    Debug.Assert( iPage>0 );
    /* Multiplicative hash, reduced to the range 0..HASHTABLE_NSLOT-1. */
    return (HASHTABLE_NSLOT-1) & (iPage*HASHTABLE_HASH_1);
  }
  /*
  ** Return the slot that follows iPriorHash, wrapping back to 0 after
  ** slot HASHTABLE_NSLOT-1.
  */
  static int walNextHash(int iPriorHash){
    return (HASHTABLE_NSLOT-1) & (1+iPriorHash);
  }
  794. /*
  795. ** Return pointers to the hash table and page number array stored on
  796. ** page iHash of the wal-index. The wal-index is broken into 32KB pages
  797. ** numbered starting from 0.
  798. **
  799. ** Set output variable *paHash to point to the start of the hash table
  800. ** in the wal-index file. Set *piZero to one less than the frame
  801. ** number of the first frame indexed by this hash table. If a
  802. ** slot in the hash table is set to N, it refers to frame number
  803. ** (*piZero+N) in the log.
  804. **
  805. ** Finally, set *paPgno so that *paPgno[1] is the page number of the
  806. ** first frame indexed by the hash table, frame (*piZero+1).
  807. */
  808. static int walHashGet(
  809. Wal *pWal, /* WAL handle */
  810. int iHash, /* Find the iHash'th table */
  811. volatile ht_slot **paHash, /* OUT: Pointer to hash index */
  812. volatile u32 **paPgno, /* OUT: Pointer to page number array */
  813. u32 *piZero /* OUT: Frame associated with *paPgno[0] */
  814. ){
  815. int rc; /* Return code */
  816. volatile u32 *aPgno; /* Start of wal-index page iHash, viewed as u32[] */
  817. rc = walIndexPage(pWal, iHash, &aPgno);
  818. Debug.Assert( rc==SQLITE_OK || iHash>0 );
       /* On failure the three output parameters are left untouched. */
  819. if( rc==SQLITE_OK ){
  820. u32 iZero;
  821. volatile ht_slot *aHash;
       /* The hash table begins HASHTABLE_NPAGE u32 entries into the page. */
  822. aHash = (volatile ht_slot *)&aPgno[HASHTABLE_NPAGE];
  823. if( iHash==0 ){
       /* On page 0 the page-number array starts after WALINDEX_HDR_SIZE bytes. */
  824. aPgno = &aPgno[WALINDEX_HDR_SIZE/sizeof(u32)];
  825. iZero = 0;
  826. }else{
       /* Frames indexed by the earlier pages: page 0 holds
       ** HASHTABLE_NPAGE_ONE, each later page HASHTABLE_NPAGE. */
  827. iZero = HASHTABLE_NPAGE_ONE + (iHash-1)*HASHTABLE_NPAGE;
  828. }
       /* Back the pointer up by one entry so the array can be indexed from 1. */
  829. *paPgno = &aPgno[-1];
  830. *paHash = aHash;
  831. *piZero = iZero;
  832. }
  833. return rc;
  834. }
  835. /*
  836. ** Return the number of the wal-index page that contains the hash-table
  837. ** and page-number array that contain entries corresponding to WAL frame
  838. ** iFrame. The wal-index is broken up into 32KB pages. Wal-index pages
  839. ** are numbered starting from 0.
  840. */
  static int walFramePage(u32 iFrame){
    /* Page 0 indexes the first HASHTABLE_NPAGE_ONE frames; every later page
    ** indexes HASHTABLE_NPAGE frames. Round up accordingly. */
    int iHash = (iFrame+HASHTABLE_NPAGE-HASHTABLE_NPAGE_ONE-1) / HASHTABLE_NPAGE;

    /* Sanity-check the boundaries of the first three pages. */
    Debug.Assert( iHash==0 || iFrame>HASHTABLE_NPAGE_ONE );
    Debug.Assert( iHash>=1 || iFrame<=HASHTABLE_NPAGE_ONE );
    Debug.Assert( iHash<=1 || iFrame>(HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE) );
    Debug.Assert( iHash>=2 || iFrame<=HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE );
    Debug.Assert( iHash<=2 || iFrame>(HASHTABLE_NPAGE_ONE+2*HASHTABLE_NPAGE) );

    return iHash;
  }
  851. /*
  852. ** Return the page number associated with frame iFrame in this WAL.
  853. */
  static u32 walFramePgno(Wal *pWal, u32 iFrame){
    int iPage = walFramePage(iFrame);   /* wal-index page that holds iFrame */
    if( iPage!=0 ){
      /* Later pages: entry offset is relative to the first frame on the page. */
      return pWal.apWiData[iPage][(iFrame-1-HASHTABLE_NPAGE_ONE)%HASHTABLE_NPAGE];
    }else{
      /* Page 0: entries follow the wal-index header. */
      return pWal.apWiData[0][WALINDEX_HDR_SIZE/sizeof(u32) + iFrame - 1];
    }
  }
  861. /*
  862. ** Remove entries from the hash table that point to WAL slots greater
  863. ** than pWal.hdr.mxFrame.
  864. **
  865. ** This function is called whenever pWal.hdr.mxFrame is decreased due
  866. ** to a rollback or savepoint.
  867. **
  868. ** At most only the hash table containing pWal.hdr.mxFrame needs to be
  869. ** updated. Any later hash tables will be automatically cleared when
  870. ** pWal.hdr.mxFrame advances to the point where those hash tables are
  871. ** actually needed.
  872. */
  873. static void walCleanupHash(Wal *pWal){
  874. volatile ht_slot *aHash = 0; /* Pointer to hash table to clear */
  875. volatile u32 *aPgno = 0; /* Page number array for hash table */
  876. u32 iZero = 0; /* frame == (aHash[x]+iZero) */
  877. int iLimit = 0; /* Zero values greater than this */
  878. int nByte; /* Number of bytes to zero in aPgno[] */
  879. int i; /* Used to iterate through aHash[] */
  880. Debug.Assert( pWal.writeLock );
  881. testcase( pWal.hdr.mxFrame==HASHTABLE_NPAGE_ONE-1 );
  882. testcase( pWal.hdr.mxFrame==HASHTABLE_NPAGE_ONE );
  883. testcase( pWal.hdr.mxFrame==HASHTABLE_NPAGE_ONE+1 );
       /* With no frames at all this routine does nothing. */
  884. if( pWal.hdr.mxFrame==0 ) return;
  885. /* Obtain pointers to the hash-table and page-number array containing
  886. ** the entry that corresponds to frame pWal.hdr.mxFrame. It is guaranteed
  887. ** that the page said hash-table and array reside on is already mapped.
  888. */
  889. Debug.Assert( pWal.nWiData>walFramePage(pWal.hdr.mxFrame) );
  890. Debug.Assert( pWal.apWiData[walFramePage(pWal.hdr.mxFrame)] );
       /* The return code is not checked here; the asserts above state the
       ** precondition that the page is already mapped. */
  891. walHashGet(pWal, walFramePage(pWal.hdr.mxFrame), &aHash, &aPgno, &iZero);
  892. /* Zero all hash-table entries that correspond to frame numbers greater
  893. ** than pWal.hdr.mxFrame.
  894. */
       /* iLimit is mxFrame expressed relative to this hash table. */
  895. iLimit = pWal.hdr.mxFrame - iZero;
  896. Debug.Assert( iLimit>0 );
  897. for(i=0; i<HASHTABLE_NSLOT; i++){
  898. if( aHash[i]>iLimit ){
  899. aHash[i] = 0;
  900. }
  901. }
  902. /* Zero the entries in the aPgno array that correspond to frames with
  903. ** frame numbers greater than pWal.hdr.mxFrame.
  904. */
       /* The range cleared runs from aPgno[iLimit+1] up to the start of aHash[]. */
  905. nByte = (int)((char *)aHash - (char *)&aPgno[iLimit+1]);
  906. memset((void *)&aPgno[iLimit+1], 0, nByte);
  907. #if SQLITE_ENABLE_EXPENSIVE_ASSERT
  908. /* Verify that every entry in the mapping region is still reachable
  909. ** via the hash table even after the cleanup.
  910. */
  911. if( iLimit ){
  912. int i; /* Loop counter */
  913. int iKey; /* Hash key */
  914. for(i=1; i<=iLimit; i++){
  915. for(iKey=walHash(aPgno[i]); aHash[iKey]; iKey=walNextHash(iKey)){
  916. if( aHash[iKey]==i ) break;
  917. }
  918. Debug.Assert( aHash[iKey]==i );
  919. }
  920. }
  921. #endif //* SQLITE_ENABLE_EXPENSIVE_ASSERT */
  922. }
  923. /*
  924. ** Set an entry in the wal-index that will map database page number
  925. ** iPage into WAL frame iFrame.
  926. */
  927. static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
  928. int rc; /* Return code */
  929. u32 iZero = 0; /* One less than frame number of aPgno[1] */
  930. volatile u32 *aPgno = 0; /* Page number array */
  931. volatile ht_slot *aHash = 0; /* Hash table */
  932. rc = walHashGet(pWal, walFramePage(iFrame), &aHash, &aPgno, &iZero);
  933. /* Assuming the wal-index file was successfully mapped, populate the
  934. ** page number array and hash table entry. On failure rc is returned
  935. ** unchanged and nothing is written.
       */
  936. if( rc==SQLITE_OK ){
  937. int iKey; /* Hash table key */
  938. int idx; /* Value to write to hash-table slot */
  939. int nCollide; /* Number of hash collisions */
       /* idx is the frame number relative to this hash table (1-based). */
  940. idx = iFrame - iZero;
  941. Debug.Assert( idx <= HASHTABLE_NSLOT/2 + 1 );
  942. /* If this is the first entry to be added to this hash-table, zero the
  943. ** entire hash table and aPgno[] array before proceeding.
  944. */
  945. if( idx==1 ){
       /* One memset covers everything from aPgno[1] to the end of aHash[]. */
  946. int nByte = (int)((u8 *)&aHash[HASHTABLE_NSLOT] - (u8 *)&aPgno[1]);
  947. memset((void*)&aPgno[1], 0, nByte);
  948. }
  949. /* If the entry in aPgno[] is already set, then the previous writer
  950. ** must have exited unexpectedly in the middle of a transaction (after
  951. ** writing one or more dirty pages to the WAL to free up memory).
  952. ** Remove the remnants of that writer's uncommitted transaction from
  953. ** the hash-table before writing any new entries.
  954. */
  955. if( aPgno[idx] ){
  956. walCleanupHash(pWal);
  957. Debug.Assert( !aPgno[idx] );
  958. }
  959. /* Write the aPgno[] array entry and the hash-table slot. Probe
       ** forward from the hashed slot until an empty one is found. More
       ** than idx occupied slots along the way is reported as corruption,
       ** and in that case nothing has been written. */
  960. nCollide = idx;
  961. for(iKey=walHash(iPage); aHash[iKey]; iKey=walNextHash(iKey)){
  962. if( (nCollide--)==0 ) return SQLITE_CORRUPT_BKPT;
  963. }
  964. aPgno[idx] = iPage;
  965. aHash[iKey] = (ht_slot)idx;
  966. #if SQLITE_ENABLE_EXPENSIVE_ASSERT
  967. /* Verify that the number of entries in the hash table exactly equals
  968. ** the number of entries in the mapping region.
  969. */
  970. {
  971. int i; /* Loop counter */
  972. int nEntry = 0; /* Number of entries in the hash table */
  973. for(i=0; i<HASHTABLE_NSLOT; i++){ if( aHash[i] ) nEntry++; }
  974. Debug.Assert( nEntry==idx );
  975. }
  976. /* Verify that every entry in the mapping region is reachable
  977. ** via the hash table. This turns out to be a really, really expensive
  978. ** thing to check, so only do this occasionally - not on every
  979. ** iteration.
  980. */
  981. if( (idx&0x3ff)==0 ){
  982. int i; /* Loop counter */
  983. for(i=1; i<=idx; i++){
  984. for(iKey=walHash(aPgno[i]); aHash[iKey]; iKey=walNextHash(iKey)){
  985. if( aHash[iKey]==i ) break;
  986. }
  987. Debug.Assert( aHash[iKey]==i );
  988. }
  989. }
  990. #endif //* SQLITE_ENABLE_EXPENSIVE_ASSERT */
  991. }
  992. return rc;
  993. }
  994. /*
  995. ** Recover the wal-index by reading the write-ahead log file.
  996. **
  997. ** This routine first tries to establish an exclusive lock on the
  998. ** wal-index to prevent other threads/processes from doing anything
  999. ** with the WAL or wal-index while recovery is running. The
  1000. ** WAL_RECOVER_LOCK is also held so that other threads will know
  1001. ** that this thread is running recovery. If unable to establish
  1002. ** the necessary locks, this routine returns SQLITE_BUSY.
  1003. */
  1004. static int walIndexRecover(Wal *pWal){
  1005. int rc; /* Return Code */
  1006. i64 nSize; /* Size of log file */
  1007. u32 aFrameCksum[2] = {0, 0}; /* Running checksum as of the last commit frame seen */
  1008. int iLock; /* Lock offset to lock for checkpoint */
  1009. int nLock; /* Number of locks to hold */
  1010. /* Obtain an exclusive lock on all bytes in the locking range not already
  1011. ** locked by the caller. The caller is guaranteed to have locked the
  1012. ** WAL_WRITE_LOCK byte, and may have also locked the WAL_CKPT_LOCK byte.
  1013. ** If successful, the same bytes that are locked here are unlocked before
  1014. ** this function returns.
  1015. */
  1016. Debug.Assert( pWal.ckptLock==1 || pWal.ckptLock==0 );
  1017. Debug.Assert( WAL_ALL_BUT_WRITE==WAL_WRITE_LOCK+1 );
  1018. Debug.Assert( WAL_CKPT_LOCK==WAL_ALL_BUT_WRITE );
  1019. Debug.Assert( pWal.writeLock );
        /* Skip the checkpoint byte when the caller already holds it. */
  1020. iLock = WAL_ALL_BUT_WRITE + pWal.ckptLock;
  1021. nLock = SQLITE_SHM_NLOCK - iLock;
  1022. rc = walLockExclusive(pWal, iLock, nLock);
  1023. if( rc ){
        /* Nothing was locked, so return directly rather than via the
        ** unlock at recovery_error. */
  1024. return rc;
  1025. }
  1026. WALTRACE(("WAL%p: recovery begin...\n", pWal));
        /* Start from an all-zero in-memory header. */
  1027. memset(&pWal.hdr, 0, sizeof(WalIndexHdr));
  1028. rc = sqlite3OsFileSize(pWal.pWalFd, &nSize);
  1029. if( rc!=SQLITE_OK ){
  1030. goto recovery_error;
  1031. }
        /* A log no larger than its header holds no frames; skip to finished. */
  1032. if( nSize>WAL_HDRSIZE ){
  1033. u8 aBuf[WAL_HDRSIZE]; /* Buffer to load WAL header into */
  1034. u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */
  1035. int szFrame; /* Number of bytes in buffer aFrame[] */
  1036. u8 *aData; /* Pointer to data part of aFrame buffer */
  1037. int iFrame; /* Index of last frame read */
  1038. i64 iOffset; /* Next offset to read from log file */
  1039. int szPage; /* Page size according to the log */
  1040. u32 magic; /* Magic value read from WAL header */
  1041. u32 version; /* Version number read from WAL header */
  1042. /* Read in the WAL header. */
  1043. rc = sqlite3OsRead(pWal.pWalFd, aBuf, WAL_HDRSIZE, 0);
  1044. if( rc!=SQLITE_OK ){
  1045. goto recovery_error;
  1046. }
  1047. /* If the database page size is not a power of two, or is greater than
  1048. ** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid
  1049. ** data. Similarly, if the 'magic' value is invalid, ignore the whole
  1050. ** WAL file. (The code also rejects page sizes below 512, and the low
        ** bit of the magic value is masked off before the comparison.)
  1051. */
  1052. magic = sqlite3Get4byte(&aBuf[0]);
  1053. szPage = sqlite3Get4byte(&aBuf[8]);
  1054. if( (magic&0xFFFFFFFE)!=WAL_MAGIC
  1055. || szPage&(szPage-1)
  1056. || szPage>SQLITE_MAX_PAGE_SIZE
  1057. || szPage<512
  1058. ){
  1059. goto finished;
  1060. }
        /* The low bit of the magic value selects the checksum byte order. */
  1061. pWal.hdr.bigEndCksum = (u8)(magic&0x00000001);
  1062. pWal.szPage = (u16)szPage;
  1063. pWal.nCkpt = sqlite3Get4byte(&aBuf[12]);
  1064. memcpy(&pWal.hdr.aSalt, &aBuf[16], 8);
  1065. /* Verify that the WAL header checksum is correct. The checksum covers
        ** the header minus its final two 4-byte words, which store the
        ** checksum itself. The result seeds the running frame checksum. */
  1066. walChecksumBytes(pWal.hdr.bigEndCksum==SQLITE_BIGENDIAN,
  1067. aBuf, WAL_HDRSIZE-2*4, 0, pWal.hdr.aFrameCksum
  1068. );
  1069. if( pWal.hdr.aFrameCksum[0]!=sqlite3Get4byte(&aBuf[24])
  1070. || pWal.hdr.aFrameCksum[1]!=sqlite3Get4byte(&aBuf[28])
  1071. ){
  1072. goto finished;
  1073. }
  1074. /* Verify that the version number on the WAL format is one that
  1075. ** we are able to understand */
  1076. version = sqlite3Get4byte(&aBuf[4]);
  1077. if( version!=WAL_MAX_VERSION ){
        /* rc is non-zero here, so the block at finished is skipped and
        ** only the unlock at recovery_error runs. */
  1078. rc = SQLITE_CANTOPEN_BKPT;
  1079. goto finished;
  1080. }
  1081. /* Malloc a buffer to read frames into. */
  1082. szFrame = szPage + WAL_FRAME_HDRSIZE;
  1083. aFrame = (u8 *)sqlite3_malloc(szFrame);
  1084. if( !aFrame ){
  1085. rc = SQLITE_NOMEM;
  1086. goto recovery_error;
  1087. }
  1088. aData = &aFrame[WAL_FRAME_HDRSIZE];
  1089. /* Read all frames from the log file. The loop stops at the first
        ** read error, invalid frame or index error; every exit is a break,
        ** so aFrame is always freed below. A trailing partial frame is
        ** never read. */
  1090. iFrame = 0;
  1091. for(iOffset=WAL_HDRSIZE; (iOffset+szFrame)<=nSize; iOffset+=szFrame){
  1092. u32 pgno; /* Database page number for frame */
  1093. u32 nTruncate; /* dbsize field from frame header */
  1094. int isValid; /* True if this frame is valid */
  1095. /* Read and decode the next log frame. */
  1096. rc = sqlite3OsRead(pWal.pWalFd, aFrame, szFrame, iOffset);
  1097. if( rc!=SQLITE_OK ) break;
  1098. isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame);
  1099. if( !isValid ) break;
  1100. rc = walIndexAppend(pWal, ++iFrame, pgno);
  1101. if( rc!=SQLITE_OK ) break;
  1102. /* If nTruncate is non-zero, this is a commit record. Only commit
        ** records advance mxFrame and snapshot the running checksum. */
  1103. if( nTruncate ){
  1104. pWal.hdr.mxFrame = iFrame;
  1105. pWal.hdr.nPage = nTruncate;
  1106. pWal.hdr.szPage = (u16)szPage;
  1107. aFrameCksum[0] = pWal.hdr.aFrameCksum[0];
  1108. aFrameCksum[1] = pWal.hdr.aFrameCksum[1];
  1109. }
  1110. }
  1111. sqlite3_free(aFrame);
  1112. }
  1113. finished:
  1114. if( rc==SQLITE_OK ){
  1115. volatile WalCkptInfo *pInfo;
  1116. int i;
        /* Rewind the running checksum to its value at the last commit frame
        ** (or to 0,0 if none was found), then publish the header. */
  1117. pWal.hdr.aFrameCksum[0] = aFrameCksum[0];
  1118. pWal.hdr.aFrameCksum[1] = aFrameCksum[1];
  1119. walIndexWriteHdr(pWal);
  1120. /* Reset the checkpoint-header. This is safe because this thread is
  1121. ** currently holding locks that exclude all other readers, writers and
  1122. ** checkpointers.
  1123. */
  1124. pInfo = walCkptInfo(pWal);
  1125. pInfo.nBackfill = 0;
  1126. pInfo.aReadMark[0] = 0;
  1127. for(i=1; i<WAL_NREADER; i++) pInfo.aReadMark[i] = READMARK_NOT_USED;
  1128. }
  1129. recovery_error:
        /* Every path that acquired the exclusive lock above releases it here. */
  1130. WALTRACE(("WAL%p: recovery %s\n", pWal, rc ? "failed" : "ok"));
  1131. walUnlockExclusive(pWal, iLock, nLock);
  1132. return rc;
  1133. }
  1134. /*
  1135. ** Close an open wal-index.
  1136. */
  1137. sta

Large files files are truncated, but you can click here to view the full file