PageRenderTime 59ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 1ms

/library/Library/WP7/SQLiteDriver/sqlite/wal_c.cs

https://bitbucket.org/digitalizarte/coolstorage
C# | 2751 lines | 1384 code | 225 blank | 1142 comment | 244 complexity | c71f94acbce0b78bc9e2a56f8e0c2e0a MD5 | raw file
  1. using System;
  2. using System.Diagnostics;
  3. using System.Text;
  4. using Bitmask = System.UInt64;
  5. using u32 = System.UInt32;
  6. namespace Community.CsharpSqlite
  7. {
  8. public partial class Sqlite3
  9. {
  10. /*
  11. ** 2010 February 1
  12. **
  13. ** The author disclaims copyright to this source code. In place of
  14. ** a legal notice, here is a blessing:
  15. **
  16. ** May you do good and not evil.
  17. ** May you find forgiveness for yourself and forgive others.
  18. ** May you share freely, never taking more than you give.
  19. **
  20. *************************************************************************
  21. **
  22. ** This file contains the implementation of a write-ahead log (WAL) used in
  23. ** "journal_mode=WAL" mode.
  24. **
  25. ** WRITE-AHEAD LOG (WAL) FILE FORMAT
  26. **
  27. ** A WAL file consists of a header followed by zero or more "frames".
  28. ** Each frame records the revised content of a single page from the
  29. ** database file. All changes to the database are recorded by writing
  30. ** frames into the WAL. Transactions commit when a frame is written that
  31. ** contains a commit marker. A single WAL can and usually does record
  32. ** multiple transactions. Periodically, the content of the WAL is
  33. ** transferred back into the database file in an operation called a
  34. ** "checkpoint".
  35. **
  36. ** A single WAL file can be used multiple times. In other words, the
  37. ** WAL can fill up with frames and then be checkpointed and then new
  38. ** frames can overwrite the old ones. A WAL always grows from beginning
  39. ** toward the end. Checksums and counters attached to each frame are
  40. ** used to determine which frames within the WAL are valid and which
  41. ** are leftovers from prior checkpoints.
  42. **
  43. ** The WAL header is 32 bytes in size and consists of the following eight
  44. ** big-endian 32-bit unsigned integer values:
  45. **
  46. ** 0: Magic number. 0x377f0682 or 0x377f0683
  47. ** 4: File format version. Currently 3007000
  48. ** 8: Database page size. Example: 1024
  49. ** 12: Checkpoint sequence number
  50. ** 16: Salt-1, random integer incremented with each checkpoint
  51. ** 20: Salt-2, a different random integer changing with each ckpt
  52. ** 24: Checksum-1 (first part of checksum for first 24 bytes of header).
  53. ** 28: Checksum-2 (second part of checksum for first 24 bytes of header).
  54. **
  55. ** Immediately following the wal-header are zero or more frames. Each
  56. ** frame consists of a 24-byte frame-header followed by a <page-size> bytes
  57. ** of page data. The frame-header is six big-endian 32-bit unsigned
  58. ** integer values, as follows:
  59. **
  60. ** 0: Page number.
  61. ** 4: For commit records, the size of the database image in pages
  62. ** after the commit. For all other records, zero.
  63. ** 8: Salt-1 (copied from the header)
  64. ** 12: Salt-2 (copied from the header)
  65. ** 16: Checksum-1.
  66. ** 20: Checksum-2.
  67. **
  68. ** A frame is considered valid if and only if the following conditions are
  69. ** true:
  70. **
  71. ** (1) The salt-1 and salt-2 values in the frame-header match
  72. ** salt values in the wal-header
  73. **
  74. ** (2) The checksum values in the final 8 bytes of the frame-header
  75. ** exactly match the checksum computed consecutively on the
  76. ** WAL header and the first 8 bytes and the content of all frames
  77. ** up to and including the current frame.
  78. **
  79. ** The checksum is computed using 32-bit big-endian integers if the
  80. ** magic number in the first 4 bytes of the WAL is 0x377f0683 and it
  81. ** is computed using little-endian if the magic number is 0x377f0682.
  82. ** The checksum values are always stored in the frame header in a
  83. ** big-endian format regardless of which byte order is used to compute
  84. ** the checksum. The checksum is computed by interpreting the input as
  85. ** an even number of unsigned 32-bit integers: x[0] through x[N]. The
  86. ** algorithm used for the checksum is as follows:
  87. **
  88. ** for i from 0 to n-1 step 2:
  89. ** s0 += x[i] + s1;
  90. ** s1 += x[i+1] + s0;
  91. ** endfor
  92. **
  93. ** Note that s0 and s1 are both weighted checksums using fibonacci weights
  94. ** in reverse order (the largest fibonacci weight occurs on the first element
  95. ** of the sequence being summed.) The s1 value spans all 32-bit
  96. ** terms of the sequence whereas s0 omits the final term.
  97. **
  98. ** On a checkpoint, the WAL is first VFS.xSync-ed, then valid content of the
  99. ** WAL is transferred into the database, then the database is VFS.xSync-ed.
  100. ** The VFS.xSync operations serve as write barriers - all writes launched
  101. ** before the xSync must complete before any write that launches after the
  102. ** xSync begins.
  103. **
  104. ** After each checkpoint, the salt-1 value is incremented and the salt-2
  105. ** value is randomized. This prevents old and new frames in the WAL from
  106. ** being considered valid at the same time and being checkpointed together
  107. ** following a crash.
  108. **
  109. ** READER ALGORITHM
  110. **
  111. ** To read a page from the database (call it page number P), a reader
  112. ** first checks the WAL to see if it contains page P. If so, then the
  113. ** last valid instance of page P that is followed by a commit frame
  114. ** or is a commit frame itself becomes the value read. If the WAL
  115. ** contains no copies of page P that are valid and which are a commit
  116. ** frame or are followed by a commit frame, then page P is read from
  117. ** the database file.
  118. **
  119. ** To start a read transaction, the reader records the index of the last
  120. ** valid frame in the WAL. The reader uses this recorded "mxFrame" value
  121. ** for all subsequent read operations. New transactions can be appended
  122. ** to the WAL, but as long as the reader uses its original mxFrame value
  123. ** and ignores the newly appended content, it will see a consistent snapshot
  124. ** of the database from a single point in time. This technique allows
  125. ** multiple concurrent readers to view different versions of the database
  126. ** content simultaneously.
  127. **
  128. ** The reader algorithm in the previous paragraphs works correctly, but
  129. ** because frames for page P can appear anywhere within the WAL, the
  130. ** reader has to scan the entire WAL looking for page P frames. If the
  131. ** WAL is large (multiple megabytes is typical) that scan can be slow,
  132. ** and read performance suffers. To overcome this problem, a separate
  133. ** data structure called the wal-index is maintained to expedite the
  134. ** search for frames of a particular page.
  135. **
  136. ** WAL-INDEX FORMAT
  137. **
  138. ** Conceptually, the wal-index is shared memory, though VFS implementations
  139. ** might choose to implement the wal-index using a mmapped file. Because
  140. ** the wal-index is shared memory, SQLite does not support journal_mode=WAL
  141. ** on a network filesystem. All users of the database must be able to
  142. ** share memory.
  143. **
  144. ** The wal-index is transient. After a crash, the wal-index can (and should
  145. ** be) reconstructed from the original WAL file. In fact, the VFS is required
  146. ** to either truncate or zero the header of the wal-index when the last
  147. ** connection to it closes. Because the wal-index is transient, it can
  148. ** use an architecture-specific format; it does not have to be cross-platform.
  149. ** Hence, unlike the database and WAL file formats which store all values
  150. ** as big endian, the wal-index can store multi-byte values in the native
  151. ** byte order of the host computer.
  152. **
  153. ** The purpose of the wal-index is to answer this question quickly: Given
  154. ** a page number P, return the index of the last frame for page P in the WAL,
  155. ** or return NULL if there are no frames for page P in the WAL.
  156. **
  157. ** The wal-index consists of a header region, followed by one or
  158. ** more index blocks.
  159. **
  160. ** The wal-index header contains the total number of frames within the WAL
  161. ** in the mxFrame field.
  162. **
  163. ** Each index block except for the first contains information on
  164. ** HASHTABLE_NPAGE frames. The first index block contains information on
  165. ** HASHTABLE_NPAGE_ONE frames. The values of HASHTABLE_NPAGE_ONE and
  166. ** HASHTABLE_NPAGE are selected so that together the wal-index header and
  167. ** first index block are the same size as all other index blocks in the
  168. ** wal-index.
  169. **
  170. ** Each index block contains two sections, a page-mapping that contains the
  171. ** database page number associated with each wal frame, and a hash-table
  172. ** that allows readers to query an index block for a specific page number.
  173. ** The page-mapping is an array of HASHTABLE_NPAGE (or HASHTABLE_NPAGE_ONE
  174. ** for the first index block) 32-bit page numbers. The first entry in the
  175. ** first index-block contains the database page number corresponding to the
  176. ** first frame in the WAL file. The first entry in the second index block
  177. ** in the WAL file corresponds to the (HASHTABLE_NPAGE_ONE+1)th frame in
  178. ** the log, and so on.
  179. **
  180. ** The last index block in a wal-index usually contains less than the full
  181. ** complement of HASHTABLE_NPAGE (or HASHTABLE_NPAGE_ONE) page-numbers,
  182. ** depending on the contents of the WAL file. This does not change the
  183. ** allocated size of the page-mapping array - the page-mapping array merely
  184. ** contains unused entries.
  185. **
  186. ** Even without using the hash table, the last frame for page P
  187. ** can be found by scanning the page-mapping sections of each index block
  188. ** starting with the last index block and moving toward the first, and
  189. ** within each index block, starting at the end and moving toward the
  190. ** beginning. The first entry that equals P corresponds to the frame
  191. ** holding the content for that page.
  192. **
  193. ** The hash table consists of HASHTABLE_NSLOT 16-bit unsigned integers.
  194. ** HASHTABLE_NSLOT = 2*HASHTABLE_NPAGE, and there is one entry in the
  195. ** hash table for each page number in the mapping section, so the hash
  196. ** table is never more than half full. The expected number of collisions
  197. ** prior to finding a match is 1. Each entry of the hash table is a
  198. ** 1-based index of an entry in the mapping section of the same
  199. ** index block. Let K be the 1-based index of the largest entry in
  200. ** the mapping section. (For index blocks other than the last, K will
  201. ** always be exactly HASHTABLE_NPAGE (4096) and for the last index block
  202. ** K will be (mxFrame%HASHTABLE_NPAGE).) Unused slots of the hash table
  203. ** contain a value of 0.
  204. **
  205. ** To look for page P in the hash table, first compute a hash iKey on
  206. ** P as follows:
  207. **
  208. ** iKey = (P * 383) % HASHTABLE_NSLOT
  209. **
  210. ** Then start scanning entries of the hash table, starting with iKey
  211. ** (wrapping around to the beginning when the end of the hash table is
  212. ** reached) until an unused hash slot is found. Let the first unused slot
  213. ** be at index iUnused. (iUnused might be less than iKey if there was
  214. ** wrap-around.) Because the hash table is never more than half full,
  215. ** the search is guaranteed to eventually hit an unused entry. Let
  216. ** iMax be the value between iKey and iUnused, closest to iUnused,
  217. ** where aHash[iMax]==P. If there is no iMax entry (if there exists
  218. ** no hash slot such that aHash[i]==p) then page P is not in the
  219. ** current index block. Otherwise the iMax-th mapping entry of the
  220. ** current index block corresponds to the last entry that references
  221. ** page P.
  222. **
  223. ** A hash search begins with the last index block and moves toward the
  224. ** first index block, looking for entries corresponding to page P. On
  225. ** average, only two or three slots in each index block need to be
  226. ** examined in order to either find the last entry for page P, or to
  227. ** establish that no such entry exists in the block. Each index block
  228. ** holds over 4000 entries. So two or three index blocks are sufficient
  229. ** to cover a typical 10 megabyte WAL file, assuming 1K pages. 8 or 10
  230. ** comparisons (on average) suffice to either locate a frame in the
  231. ** WAL or to establish that the frame does not exist in the WAL. This
  232. ** is much faster than scanning the entire 10MB WAL.
  233. **
  234. ** Note that entries are added in order of increasing K. Hence, one
  235. ** reader might be using some value K0 and a second reader that started
  236. ** at a later time (after additional transactions were added to the WAL
  237. ** and to the wal-index) might be using a different value K1, where K1>K0.
  238. ** Both readers can use the same hash table and mapping section to get
  239. ** the correct result. There may be entries in the hash table with
  240. ** K>K0 but to the first reader, those entries will appear to be unused
  241. ** slots in the hash table and so the first reader will get an answer as
  242. ** if no values greater than K0 had ever been inserted into the hash table
  243. ** in the first place - which is what reader one wants. Meanwhile, the
  244. ** second reader using K1 will see additional values that were inserted
  245. ** later, which is exactly what reader two wants.
  246. **
  247. ** When a rollback occurs, the value of K is decreased. Hash table entries
  248. ** that correspond to frames greater than the new K value are removed
  249. ** from the hash table at this point.
  250. *************************************************************************
  251. ** Included in SQLite3 port to C#-SQLite; 2008 Noah B Hart
  252. ** C#-SQLite is an independent reimplementation of the SQLite software library
  253. **
  254. ** SQLITE_SOURCE_ID: 2010-12-07 20:14:09 a586a4deeb25330037a49df295b36aaf624d0f45
  255. **
  256. *************************************************************************
  257. */
  258. #if !SQLITE_OMIT_WAL
  259. //#include "wal.h"
  260. /*
  261. ** Trace output macros
  262. */
  #if (SQLITE_TEST) && (SQLITE_DEBUG)
  /* Non-zero switches WAL trace output on. Declared static so that the
  ** static WALTRACE() helper below is able to read it. */
  static int sqlite3WalTrace = 0;
  //# define WALTRACE(X) if(sqlite3WalTrace) sqlite3DebugPrintf X
  /*
  ** Pass the trace arguments X on to sqlite3DebugPrintf() when tracing
  ** has been enabled through sqlite3WalTrace; otherwise do nothing.
  */
  static void WALTRACE(params object[] X)
  {
    /* C# has no implicit int-to-bool conversion, so compare explicitly */
    if( sqlite3WalTrace!=0 ) sqlite3DebugPrintf(X);
  }
  #else
  //# define WALTRACE(X)
  /* Tracing is compiled out: WALTRACE() accepts any arguments and ignores them. */
  static void WALTRACE(params object[] X) {}
  #endif
  274. /*
  275. ** The maximum (and only) versions of the wal and wal-index formats
  276. ** that may be interpreted by this version of SQLite.
  277. **
  278. ** If a client begins recovering a WAL file and finds that (a) the checksum
  279. ** values in the wal-header are correct and (b) the version field is not
  280. ** WAL_MAX_VERSION, recovery fails and SQLite returns SQLITE_CANTOPEN.
  281. **
  282. ** Similarly, if a client successfully reads a wal-index header (i.e. the
  283. ** checksum test is successful) and finds that the version field is not
  284. ** WALINDEX_MAX_VERSION, then no read-transaction is opened and SQLite
  285. ** returns SQLITE_CANTOPEN.
  286. */
  287. //#define WAL_MAX_VERSION 3007000
  288. //#define WALINDEX_MAX_VERSION 3007000
  const int WAL_MAX_VERSION = 3007000,      /* only WAL file-format version understood */
            WALINDEX_MAX_VERSION = 3007000; /* only wal-index format version understood */
  291. /*
  292. ** Indices of various locking bytes. WAL_NREADER is the number
  293. ** of available reader locks and should be at least 3.
  294. */
  295. //#define WAL_WRITE_LOCK 0
  296. //#define WAL_ALL_BUT_WRITE 1
  297. //#define WAL_CKPT_LOCK 1
  298. //#define WAL_RECOVER_LOCK 2
  299. //#define WAL_READ_LOCK(I) (3+(I))
  300. //#define WAL_NREADER (SQLITE_SHM_NLOCK-3)
  /* Indices of the individual locking bytes. */
  const int WAL_WRITE_LOCK = 0;
  const int WAL_ALL_BUT_WRITE = 1;
  const int WAL_CKPT_LOCK = 1;
  const int WAL_RECOVER_LOCK = 2;
  /*
  ** Return the locking-byte index of read lock number I. The read locks
  ** follow the three fixed locks above, so read lock 0 has index 3.
  **
  ** The C original is a parameterised macro; a C# const cannot take an
  ** argument, so it is expressed as a static method. Call sites of the
  ** form WAL_READ_LOCK(i) are unaffected.
  */
  static int WAL_READ_LOCK(int I)
  {
    return 3 + I;
  }
  /* Number of reader locks: every lock slot except the three fixed ones. */
  const int WAL_NREADER = (SQLITE_SHM_NLOCK-3);
  /* Object declarations */
  /* The C forward declarations below are kept for reference only. C# has
  ** no typedef and resolves type names independently of declaration
  ** order, so they are neither valid nor needed here. */
  //typedef struct WalIndexHdr WalIndexHdr;
  //typedef struct WalIterator WalIterator;
  //typedef struct WalCkptInfo WalCkptInfo;
  311. /*
  312. ** The following object holds a copy of the wal-index header content.
  313. **
  314. ** The actual header in the wal-index consists of two copies of this
  315. ** object.
  316. */
  struct WalIndexHdr {
    /* Version of the wal-index format */
    u32 iVersion;
    /* Padding; never used */
    u32 unused;
    /* Counter that is bumped by every transaction */
    u32 iChange;
    /* Set to 1 once the header has been initialized */
    u8 isInit;
    /* True when the WAL checksums are computed big-endian */
    u8 bigEndCksum;
    /* Size of a database page, in bytes */
    u16 szPage;
    /* Index of the last valid frame in the WAL */
    u32 mxFrame;
    /* Database size, counted in pages */
    u32 nPage;
    /* Checksum of the last frame in the log */
    u32 aFrameCksum[2];
    /* The two salt values, copied from the WAL header */
    u32 aSalt[2];
    /* Checksum covering every field that precedes it */
    u32 aCksum[2];
  };
  330. /*
  331. ** A copy of the following object occurs in the wal-index immediately
  332. ** following the second copy of the WalIndexHdr. This object stores
  333. ** information used by checkpoint.
  334. **
  335. ** nBackfill is the number of frames in the WAL that have been written
  336. ** back into the database. (We call the act of moving content from WAL to
  337. ** database "backfilling".) The nBackfill number is never greater than
  338. ** WalIndexHdr.mxFrame. nBackfill can only be increased by threads
  339. ** holding the WAL_CKPT_LOCK lock (which includes a recovery thread).
  340. ** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from
  341. ** mxFrame back to zero when the WAL is reset.
  342. **
  343. ** There is one entry in aReadMark[] for each reader lock. If a reader
  344. ** holds read-lock K, then the value in aReadMark[K] is no greater than
  345. ** the mxFrame for that reader. The value READMARK_NOT_USED (0xffffffff)
  346. ** for any aReadMark[] means that entry is unused. aReadMark[0] is
  347. ** a special case; its value is never used and it exists as a place-holder
  348. ** to avoid having to offset aReadMark[] indexes by one. Readers holding
  349. ** WAL_READ_LOCK(0) always ignore the entire WAL and read all content
  350. ** directly from the database.
  351. **
  352. ** The value of aReadMark[K] may only be changed by a thread that
  353. ** is holding an exclusive lock on WAL_READ_LOCK(K). Thus, the value of
  354. ** aReadMark[K] cannot be changed while there is a reader using that mark
  355. ** since the reader will be holding a shared lock on WAL_READ_LOCK(K).
  356. **
  357. ** The checkpointer may only transfer frames from WAL to database where
  358. ** the frame numbers are less than or equal to every aReadMark[] that is
  359. ** in use (that is, every aReadMark[j] for which there is a corresponding
  360. ** WAL_READ_LOCK(j)). New readers (usually) pick the aReadMark[] with the
  361. ** largest value and will increase an unused aReadMark[] to mxFrame if there
  362. ** is not already an aReadMark[] equal to mxFrame. The exception to the
  363. ** previous sentence is when nBackfill equals mxFrame (meaning that everything
  364. ** in the WAL has been backfilled into the database) then new readers
  365. ** will choose aReadMark[0] which has value 0 and hence such reader will
  366. ** get all their content directly from the database file and ignore
  367. ** the WAL.
  368. **
  369. ** Writers normally append new frames to the end of the WAL. However,
  370. ** if nBackfill equals mxFrame (meaning that all WAL content has been
  371. ** written back into the database) and if no readers are using the WAL
  372. ** (in other words, if there are no WAL_READ_LOCK(i) where i>0) then
  373. ** the writer will first "reset" the WAL back to the beginning and start
  374. ** writing new content beginning at frame 1.
  375. **
  376. ** We assume that 32-bit loads are atomic and so no locks are needed in
  377. ** order to read from any aReadMark[] entries.
  378. */
  struct WalCkptInfo {
    /* How many WAL frames have been backfilled into the database */
    u32 nBackfill;
    /* One read mark per reader lock */
    u32 aReadMark[WAL_NREADER];
  };
  383. //#define READMARK_NOT_USED 0xffffffff
  /* Value held by an aReadMark[] entry that is not in use. It is typed
  ** u32 because 0xffffffff is outside the range of a signed int constant
  ** and aReadMark[] entries are themselves u32. */
  const u32 READMARK_NOT_USED = 0xffffffff;
  385. /* A block of WALINDEX_LOCK_RESERVED bytes beginning at
  386. ** WALINDEX_LOCK_OFFSET is reserved for locks. Since some systems
  387. ** only support mandatory file-locks, we do not read or write data
  388. ** from the region of the file on which locks are applied.
  389. */
  390. //#define WALINDEX_LOCK_OFFSET (sizeof(WalIndexHdr)*2 + sizeof(WalCkptInfo))
  391. //#define WALINDEX_LOCK_RESERVED 16
  392. //#define WALINDEX_HDR_SIZE (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)
  /*
  ** Layout of the start of the wal-index: two copies of WalIndexHdr,
  ** one WalCkptInfo, then WALINDEX_LOCK_RESERVED bytes set aside for locks.
  **
  ** C# cannot apply sizeof() to the WalIndexHdr/WalCkptInfo types inside a
  ** constant expression, so the sizes are spelled out from the field lists:
  **
  **   WalIndexHdr: 3*u32 + 2*u8 + u16 + 2*u32 + 3*(2*u32) = 48 bytes
  **   WalCkptInfo: nBackfill + aReadMark[WAL_NREADER] = (1+WAL_NREADER)*4 bytes
  */
  const int WALINDEX_LOCK_OFFSET = (48*2 + (1+WAL_NREADER)*sizeof(u32));
  const int WALINDEX_LOCK_RESERVED = 16;
  const int WALINDEX_HDR_SIZE = (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED);
  396. /* Size of header before each frame in wal */
  397. //#define WAL_FRAME_HDRSIZE 24
  const int WAL_FRAME_HDRSIZE = 24;
  /* Size of write ahead log header, including checksum. */
  /* #define WAL_HDRSIZE 24 */
  //#define WAL_HDRSIZE 32
  const int WAL_HDRSIZE = 32;
  /* WAL magic value. Either this value, or the same value with the least
  ** significant bit also set (WAL_MAGIC | 0x00000001) is stored in 32-bit
  ** big-endian format in the first 4 bytes of a WAL file.
  **
  ** If the LSB is set, then the checksums for each frame within the WAL
  ** file are calculated by treating all data as an array of 32-bit
  ** big-endian words. Otherwise, they are calculated by interpreting
  ** all data as 32-bit little-endian words.
  */
  //#define WAL_MAGIC 0x377f0682
  const int WAL_MAGIC = 0x377f0682;
  /*
  ** Return the offset of frame iFrame in the write-ahead log file,
  ** assuming a database page size of szPage bytes. The offset returned
  ** is to the start of the write-ahead log frame-header.
  **
  ** iFrame is the 1-based frame number, so frame 1 starts immediately
  ** after the WAL_HDRSIZE-byte WAL header. The product is computed and
  ** returned as a 64-bit value, as in the C macro, because the offset can
  ** exceed the range of a 32-bit int.
  */
  //#define walFrameOffset(iFrame, szPage) ( \
  // WAL_HDRSIZE + ((iFrame)-1)*(i64)((szPage)+WAL_FRAME_HDRSIZE) \
  //)
  static long walFrameOffset(u32 iFrame, int szPage)
  {
    return WAL_HDRSIZE + (iFrame-1)*(long)(szPage+WAL_FRAME_HDRSIZE);
  }
  425. /*
  426. ** An open write-ahead log file is represented by an instance of the
  427. ** following object.
  428. */
  struct Wal {
    /* VFS that was used to create pDbFd */
    sqlite3_vfs *pVfs;
    /* Handle of the database file */
    sqlite3_file *pDbFd;
    /* Handle of the WAL file */
    sqlite3_file *pWalFd;
    /* Value handed to the log callback (or 0) */
    u32 iCallback;
    /* Number of entries in apWiData */
    int nWiData;
    /* In-memory pointers to the wal-index content */
    volatile u32 **apWiData;
    /* Page size of the database */
    u16 szPage;
    /* Read lock currently held; -1 when none is held */
    i16 readLock;
    /* Non-zero when the connection is in exclusive mode */
    u8 exclusiveMode;
    /* True while a write transaction is open */
    u8 writeLock;
    /* True while a checkpoint lock is held */
    u8 ckptLock;
    /* True when the WAL file was opened read-only */
    u8 readOnly;
    /* Wal-index header belonging to the current transaction */
    WalIndexHdr hdr;
    /* Name of the WAL file */
    const char *zWalName;
    /* Checkpoint sequence counter stored in the wal-header */
    u32 nCkpt;
  #if SQLITE_DEBUG
    /* True once a locking error has occurred */
    u8 lockError;
  #endif
  };
  449. /*
  450. ** Each page of the wal-index mapping contains a hash-table made up of
  451. ** an array of HASHTABLE_NSLOT elements of the following type.
  452. */
  /* NOTE(review): this is still the C typedef carried over from wal.c.
  ** C# has no typedef; presumably ht_slot needs a file-level alias such as
  ** "using ht_slot = System.UInt16;" instead -- confirm when the port of
  ** this file is completed. */
  453. typedef u16 ht_slot;
  454. /*
  455. ** This structure is used to implement an iterator that loops through
  456. ** all frames in the WAL in database page order. Where two or more frames
  457. ** correspond to the same database page, the iterator visits only the
  458. ** frame most recently written to the WAL (in other words, the frame with
  459. ** the largest index).
  460. **
  461. ** The internals of this structure are only accessed by:
  462. **
  463. ** walIteratorInit() - Create a new iterator,
  464. ** walIteratorNext() - Step an iterator,
  465. ** walIteratorFree() - Free an iterator.
  466. **
  467. ** This functionality is used by the checkpoint code (see walCheckpoint()).
  468. */
  struct WalIterator {
    /* Most recent result handed back by the iterator */
    int iPrior;
    /* Number of entries in aSegment[] */
    int nSegment;
    /* One entry for every 32KB page in the WAL */
    struct WalSegment {
      /* Next aIndex[] slot that has not been returned yet */
      int iNext;
      /* i0, i1, i2... chosen so that aPgno[iN] ascend */
      ht_slot *aIndex;
      /* Array of page numbers */
      u32 *aPgno;
      /* Maximum size of the aPgno[] and aIndex[] arrays */
      int nEntry;
      /* Frame number that corresponds to aPgno[0] */
      int iZero;
    } aSegment[1];
  };
  480. /*
  481. ** Define the parameters of the hash tables in the wal-index file. There
  482. ** is a hash-table following every HASHTABLE_NPAGE page numbers in the
  483. ** wal-index.
  484. **
  485. ** Changing any of these constants will alter the wal-index format and
  486. ** create incompatibilities.
  487. */
  488. //#define HASHTABLE_NPAGE 4096 /* Must be power of 2 */
  489. //#define HASHTABLE_HASH_1 383 /* Should be prime */
  490. //#define HASHTABLE_NSLOT (HASHTABLE_NPAGE*2) /* Must be a power of 2 */
  /* Page numbers per index block; must be a power of 2 */
  const int HASHTABLE_NPAGE = 4096;
  /* Multiplier used when hashing a page number; should be prime */
  const int HASHTABLE_HASH_1 = 383;
  /* Hash slots per index block; must be a power of 2 */
  const int HASHTABLE_NSLOT = 2*HASHTABLE_NPAGE;
  494. /*
  495. ** The block of page numbers associated with the first hash-table in a
  496. ** wal-index is smaller than usual. This is so that there is a complete
  497. ** hash-table on each aligned 32KB page of the wal-index.
  498. */
  499. //#define HASHTABLE_NPAGE_ONE (HASHTABLE_NPAGE - (WALINDEX_HDR_SIZE/sizeof(u32)))
  /* Page-number capacity of the first index block: a full block less the
  ** u32 slots taken up by the wal-index header. */
  const int HASHTABLE_NPAGE_ONE = HASHTABLE_NPAGE - WALINDEX_HDR_SIZE/sizeof(u32);
  501. /* The wal-index is divided into pages of WALINDEX_PGSZ bytes each. */
  502. //#define WALINDEX_PGSZ ( \
  503. // sizeof(ht_slot)*HASHTABLE_NSLOT + HASHTABLE_NPAGE*sizeof(u32) \
  504. //)
  /*
  ** Size in bytes of one wal-index page: a hash table of HASHTABLE_NSLOT
  ** slots plus HASHTABLE_NPAGE 32-bit page numbers.
  **
  ** Declared as a constant rather than a method because, like the C
  ** macro, it is used as a plain value (WALINDEX_PGSZ, no parentheses).
  */
  const int WALINDEX_PGSZ = sizeof(ht_slot)*HASHTABLE_NSLOT + HASHTABLE_NPAGE*sizeof(u32);
  508. /*
  509. ** Obtain a pointer to the iPage'th page of the wal-index. The wal-index
  510. ** is broken into pages of WALINDEX_PGSZ bytes. Wal-index pages are
  511. ** numbered from zero.
  512. **
  513. ** If this call is successful, *ppPage is set to point to the wal-index
  514. ** page and SQLITE_OK is returned. If an error (an OOM or VFS error) occurs,
  515. ** then an SQLite error code is returned and *ppPage is set to 0.
  516. */
  static int walIndexPage(Wal *pWal, int iPage, volatile u32 **ppPage){
    int rc = SQLITE_OK;

    /* Grow pWal.apWiData[] first when it has no slot for iPage yet. The
    ** newly added slots are zeroed so they read as "not mapped". */
    if( iPage>=pWal.nWiData ){
      int nSlot = iPage+1;
      int nByte = sizeof(u32*)*nSlot;
      volatile u32 **apNew;
      apNew = (volatile u32 **)sqlite3_realloc((void *)pWal.apWiData, nByte);
      if( !apNew ){
        *ppPage = 0;
        return SQLITE_NOMEM;
      }
      memset((void*)&apNew[pWal.nWiData], 0,
             sizeof(u32*)*(nSlot-pWal.nWiData));
      pWal.apWiData = apNew;
      pWal.nWiData = nSlot;
    }

    /* Obtain the page itself if it has not been obtained before: from the
    ** VFS shared-memory interface normally, from the heap in heap-memory mode. */
    if( pWal.apWiData[iPage]==0 ){
      if( pWal.exclusiveMode!=WAL_HEAPMEMORY_MODE ){
        rc = sqlite3OsShmMap(pWal.pDbFd, iPage, WALINDEX_PGSZ,
            pWal.writeLock, (void volatile **)&pWal.apWiData[iPage]
        );
      }else{
        pWal.apWiData[iPage] = (u32 volatile *)sqlite3MallocZero(WALINDEX_PGSZ);
        if( !pWal.apWiData[iPage] ) rc = SQLITE_NOMEM;
      }
    }

    *ppPage = pWal.apWiData[iPage];
    Debug.Assert( iPage==0 || *ppPage || rc!=SQLITE_OK );
    return rc;
  }
  548. /*
  549. ** Return a pointer to the WalCkptInfo structure in the wal-index.
  550. */
  static volatile WalCkptInfo *walCkptInfo(Wal *pWal){
    volatile u32 *aPage0;
    Debug.Assert( pWal.nWiData>0 && pWal.apWiData[0] );
    aPage0 = pWal.apWiData[0];
    /* The index is in u32 units: sizeof(WalIndexHdr)/2 words is the byte
    ** size of the two header copies that precede the WalCkptInfo. */
    return (volatile WalCkptInfo*)&(aPage0[sizeof(WalIndexHdr)/2]);
  }
  555. /*
  556. ** Return a pointer to the WalIndexHdr structure in the wal-index.
  557. */
  static volatile WalIndexHdr *walIndexHdr(Wal *pWal){
    volatile u32 *aPage0;
    Debug.Assert( pWal.nWiData>0 && pWal.apWiData[0] );
    /* The header sits at the very start of wal-index page 0 */
    aPage0 = pWal.apWiData[0];
    return (volatile WalIndexHdr*)aPage0;
  }
  562. /*
  563. ** The argument to this macro must be of type u32. On a little-endian
  564. ** architecture, it returns the u32 value that results from interpreting
  565. ** the 4 bytes as a big-endian value. On a big-endian architecture, it
  566. ** returns the value that would be produced by interpreting the 4 bytes
  567. ** of the input value as a little-endian integer.
  568. */
  569. //#define BYTESWAP32(x) ( \
  570. // (((x)&0x000000FF)<<24) + (((x)&0x0000FF00)<<8) \
  571. // + (((x)&0x00FF0000)>>8) + (((x)&0xFF000000)>>24) \
  572. //)
  /*
  ** Reverse the order of the four bytes of x and return the result.
  **
  ** Both the argument and the result are u32, matching the u32 words the
  ** checksum code passes in. With a signed int the 0xFF000000 mask would
  ** promote the expression to a 64-bit value that cannot be returned as
  ** int, and a right shift of a negative int would smear the sign bit
  ** into the result.
  */
  static u32 BYTESWAP32(u32 x)
  {
    return ((x & 0x000000FFu) << 24)
         + ((x & 0x0000FF00u) << 8)
         + ((x & 0x00FF0000u) >> 8)
         + ((x & 0xFF000000u) >> 24);
  }
  577. /*
  578. ** Generate or extend an 8 byte checksum based on the data in
  579. ** array aByte[] and the initial values of aIn[0] and aIn[1] (or
  580. ** initial values of 0 and 0 if aIn==NULL).
  581. **
  582. ** The checksum is written back into aOut[] before returning.
  583. **
  584. ** nByte must be a positive multiple of 8.
  585. */
  static void walChecksumBytes(
    int nativeCksum, /* Non-zero: sum words as stored; zero: byte-swap each word first */
    u8 *a,           /* The bytes to checksum */
    int nByte,       /* Number of bytes in a[]; a multiple of 8 */
    const u32 *aIn,  /* Starting checksum, or NULL to start from 0,0 */
    u32 *aOut        /* OUT: resulting checksum */
  ){
    u32 *pWord = (u32 *)a;
    u32 *pStop = (u32 *)&a[nByte];
    u32 s1 = 0;
    u32 s2 = 0;

    /* Continue from a previous checksum when one is supplied. The input is
    ** read before aOut[] is written, so aIn and aOut may be the same array. */
    if( aIn ){
      s1 = aIn[0];
      s2 = aIn[1];
    }

    Debug.Assert( nByte>=8 );
    Debug.Assert( (nByte&0x00000007)==0 );

    /* Consume the input two 32-bit words at a time */
    if( !nativeCksum ){
      do {
        s1 += BYTESWAP32(pWord[0]) + s2;
        s2 += BYTESWAP32(pWord[1]) + s1;
        pWord += 2;
      }while( pWord<pStop );
    }else{
      do {
        s1 += pWord[0] + s2;
        s2 += pWord[1] + s1;
        pWord += 2;
      }while( pWord<pStop );
    }

    aOut[0] = s1;
    aOut[1] = s2;
  }
  /*
  ** Issue a shared-memory barrier on the database file handle, unless the
  ** connection is in heap-memory mode, in which case nothing is done.
  */
  static void walShmBarrier(Wal *pWal){
    if( pWal.exclusiveMode==WAL_HEAPMEMORY_MODE ) return;
    sqlite3OsShmBarrier(pWal.pDbFd);
  }
  624. /*
  625. ** Write the header information in pWal.hdr into the wal-index.
  626. **
  627. ** The checksum on pWal.hdr is updated before it is written.
  628. */
  static void walIndexWriteHdr(Wal *pWal){
    /* Number of header bytes in front of aCksum, i.e. the bytes checksummed */
    const int nCksum = offsetof(WalIndexHdr, aCksum);
    volatile WalIndexHdr *aHdr = walIndexHdr(pWal);

    Debug.Assert( pWal.writeLock );

    /* Stamp the local header and refresh its checksum */
    pWal.hdr.iVersion = WALINDEX_MAX_VERSION;
    pWal.hdr.isInit = 1;
    walChecksumBytes(1, (u8*)&pWal.hdr, nCksum, 0, pWal.hdr.aCksum);

    /* Write the second copy, then a barrier, and only then the first copy */
    memcpy((void *)&aHdr[1], (void *)&pWal.hdr, sizeof(WalIndexHdr));
    walShmBarrier(pWal);
    memcpy((void *)&aHdr[0], (void *)&pWal.hdr, sizeof(WalIndexHdr));
  }
  640. /*
  641. ** This function encodes a single frame header and writes it to a buffer
  642. ** supplied by the caller. A frame-header is made up of a series of
  643. ** 4-byte big-endian integers, as follows:
  644. **
  645. ** 0: Page number.
  646. ** 4: For commit records, the size of the database image in pages
  647. ** after the commit. For all other records, zero.
  648. ** 8: Salt-1 (copied from the wal-header)
  649. ** 12: Salt-2 (copied from the wal-header)
  650. ** 16: Checksum-1.
  651. ** 20: Checksum-2.
  652. */
  static void walEncodeFrame(
    Wal *pWal,      /* The write-ahead log */
    u32 iPage,      /* Database page number for frame */
    u32 nTruncate,  /* New db size (or 0 for non-commit frames) */
    u8 *aData,      /* Pointer to page data */
    u8 *aFrame      /* OUT: Write encoded frame here */
  ){
    /* Running frame checksum kept in the WAL header; updated in place. */
    u32 *aRunning = pWal.hdr.aFrameCksum;
    /* True if checksums use this machine's native byte order. */
    int isNative = (pWal.hdr.bigEndCksum==SQLITE_BIGENDIAN);

    Debug.Assert( WAL_FRAME_HDRSIZE==24 );

    /* Bytes 0..15: page number, database size, and the two salt values. */
    sqlite3Put4byte(&aFrame[0], iPage);
    sqlite3Put4byte(&aFrame[4], nTruncate);
    memcpy(&aFrame[8], pWal.hdr.aSalt, 8);

    /* Extend the running checksum over the first 8 header bytes and then
    ** over the page content.
    */
    walChecksumBytes(isNative, aFrame, 8, aRunning, aRunning);
    walChecksumBytes(isNative, aData, pWal.szPage, aRunning, aRunning);

    /* Bytes 16..23: the updated checksum. */
    sqlite3Put4byte(&aFrame[16], aRunning[0]);
    sqlite3Put4byte(&aFrame[20], aRunning[1]);
  }
  672. /*
  673. ** Check to see if the frame with header in aFrame[] and content
  674. ** in aData[] is valid. If it is a valid frame, fill *piPage and
  675. ** *pnTruncate and return true. Return 0 if the frame is not valid.
  676. */
  static int walDecodeFrame(
    Wal *pWal,        /* The write-ahead log */
    u32 *piPage,      /* OUT: Database page number for frame */
    u32 *pnTruncate,  /* OUT: New db size (or 0 if not commit) */
    u8 *aData,        /* Pointer to page data (for checksum) */
    u8 *aFrame        /* Frame data */
  ){
    u32 *aRunning = pWal.hdr.aFrameCksum;  /* Running checksum in WAL header */
    u32 iFramePage;                        /* Page number stored in the frame */
    int isNative;                          /* True for native byte-order checksums */

    Debug.Assert( WAL_FRAME_HDRSIZE==24 );

    /* Reject the frame unless its salt values match those in the
    ** wal-header.
    */
    if( memcmp(&pWal.hdr.aSalt, &aFrame[8], 8)!=0 ) return 0;

    /* Reject the frame unless its page number is greater than zero. */
    iFramePage = sqlite3Get4byte(&aFrame[0]);
    if( iFramePage==0 ) return 0;

    /* Extend the running checksum (which already covers the WAL header and
    ** all prior frames) over the first 8 bytes of this frame-header and
    ** over the frame data. Note that this updates pWal.hdr.aFrameCksum even
    ** if the comparison below fails.
    */
    isNative = (pWal.hdr.bigEndCksum==SQLITE_BIGENDIAN);
    walChecksumBytes(isNative, aFrame, 8, aRunning, aRunning);
    walChecksumBytes(isNative, aData, pWal.szPage, aRunning, aRunning);

    /* The frame is valid only if the result matches the checksum stored in
    ** the last 8 bytes of the frame-header. On a match, hand back the page
    ** number and the new database size.
    */
    if( aRunning[0]==sqlite3Get4byte(&aFrame[16])
     && aRunning[1]==sqlite3Get4byte(&aFrame[20])
    ){
      *piPage = iFramePage;
      *pnTruncate = sqlite3Get4byte(&aFrame[4]);
      return 1;
    }

    /* Checksum failed. */
    return 0;
  }
  721. #if (SQLITE_TEST) && (SQLITE_DEBUG)
  722. /*
  723. ** Names of locks. This routine is used to provide debugging output and is not
  724. ** a part of an ordinary build.
  725. */
  static const char *walLockName(int lockIdx){
    /* Buffer for the formatted read-lock name. It is static, so the pointer
    ** returned for a read-lock is overwritten by the next such call.
    */
    static char zName[15];

    if( lockIdx==WAL_WRITE_LOCK ) return "WRITE-LOCK";
    if( lockIdx==WAL_CKPT_LOCK ) return "CKPT-LOCK";
    if( lockIdx==WAL_RECOVER_LOCK ) return "RECOVER-LOCK";

    /* Anything else is reported as a read-lock, numbered relative to
    ** WAL_READ_LOCK(0).
    */
    sqlite3_snprintf(sizeof(zName), zName, "READ-LOCK[%d]",
                     lockIdx-WAL_READ_LOCK(0));
    return zName;
  }
  740. #endif //*defined(SQLITE_TEST) || defined(SQLITE_DEBUG) */
  741. /*
  742. ** Set or release locks on the WAL. Locks are either shared or exclusive.
  743. ** A lock cannot be moved directly between shared and exclusive - it must go
  744. ** through the unlocked state first.
  745. **
  746. ** In locking_mode=EXCLUSIVE, all of these routines become no-ops.
  747. */
  static int walLockShared(Wal *pWal, int lockIdx){
    int rc = SQLITE_OK;  /* Stays SQLITE_OK when exclusiveMode is set */

    /* Take a single SHARED lock at lockIdx, unless running in exclusive
    ** mode, in which case no lock is taken at all.
    */
    if( !pWal.exclusiveMode ){
      rc = sqlite3OsShmLock(pWal.pDbFd, lockIdx, 1,
                            SQLITE_SHM_LOCK | SQLITE_SHM_SHARED);
      WALTRACE(("WAL%p: acquire SHARED-%s %s\n", pWal,
                walLockName(lockIdx), rc ? "failed" : "ok"));
      VVA_ONLY( pWal.lockError = (u8)(rc!=SQLITE_OK && rc!=SQLITE_BUSY); )
    }
    return rc;
  }
  static void walUnlockShared(Wal *pWal, int lockIdx){
    /* Release a single SHARED lock at lockIdx. Nothing is done in exclusive
    ** mode. The result of the unlock call is deliberately ignored.
    */
    if( !pWal.exclusiveMode ){
      (void)sqlite3OsShmLock(pWal.pDbFd, lockIdx, 1,
                             SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED);
      WALTRACE(("WAL%p: release SHARED-%s\n", pWal, walLockName(lockIdx)));
    }
  }
  static int walLockExclusive(Wal *pWal, int lockIdx, int n){
    int rc = SQLITE_OK;  /* Stays SQLITE_OK when exclusiveMode is set */

    /* Take EXCLUSIVE locks on n lock slots starting at lockIdx, unless
    ** running in exclusive mode, in which case no lock is taken at all.
    */
    if( !pWal.exclusiveMode ){
      rc = sqlite3OsShmLock(pWal.pDbFd, lockIdx, n,
                            SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE);
      WALTRACE(("WAL%p: acquire EXCLUSIVE-%s cnt=%d %s\n", pWal,
                walLockName(lockIdx), n, rc ? "failed" : "ok"));
      VVA_ONLY( pWal.lockError = (u8)(rc!=SQLITE_OK && rc!=SQLITE_BUSY); )
    }
    return rc;
  }
  static void walUnlockExclusive(Wal *pWal, int lockIdx, int n){
    /* Release EXCLUSIVE locks on n lock slots starting at lockIdx. Nothing
    ** is done in exclusive mode. The result of the unlock call is
    ** deliberately ignored.
    */
    if( !pWal.exclusiveMode ){
      (void)sqlite3OsShmLock(pWal.pDbFd, lockIdx, n,
                             SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE);
      WALTRACE(("WAL%p: release EXCLUSIVE-%s cnt=%d\n", pWal,
                walLockName(lockIdx), n));
    }
  }
  781. /*
  782. ** Compute a hash on a page number. The resulting hash value must land
  783. ** between 0 and (HASHTABLE_NSLOT-1). The walHashNext() function advances
  784. ** the hash to the next value in the event of a collision.
  785. */
  static int walHash(u32 iPage){
    u32 scrambled;  /* Page number multiplied by the hash constant */

    Debug.Assert( iPage>0 );
    /* The masking below only works if HASHTABLE_NSLOT is a power of two. */
    Debug.Assert( (HASHTABLE_NSLOT & (HASHTABLE_NSLOT-1))==0 );

    scrambled = iPage*HASHTABLE_HASH_1;
    return scrambled & (HASHTABLE_NSLOT-1);
  }
  /*
  ** Return the hash slot that follows iPriorHash, wrapping around to slot 0
  ** after slot HASHTABLE_NSLOT-1.
  */
  static int walNextHash(int iPriorHash){
    return (iPriorHash+1)&(HASHTABLE_NSLOT-1);
  }
  794. /*
  795. ** Return pointers to the hash table and page number array stored on
  796. ** page iHash of the wal-index. The wal-index is broken into 32KB pages
  797. ** numbered starting from 0.
  798. **
  799. ** Set output variable *paHash to point to the start of the hash table
  800. ** in the wal-index file. Set *piZero to one less than the frame
  801. ** number of the first frame indexed by this hash table. If a
  802. ** slot in the hash table is set to N, it refers to frame number
  803. ** (*piZero+N) in the log.
  804. **
  805. ** Finally, set *paPgno so that *paPgno[1] is the page number of the
  806. ** first frame indexed by the hash table, frame (*piZero+1).
  807. */
  static int walHashGet(
    Wal *pWal,                  /* WAL handle */
    int iHash,                  /* Find the iHash'th table */
    volatile ht_slot **paHash,  /* OUT: Pointer to hash index */
    volatile u32 **paPgno,      /* OUT: Pointer to page number array */
    u32 *piZero                 /* OUT: Frame associated with *paPgno[0] */
  ){
    volatile u32 *aPage;  /* Start of wal-index page iHash */
    int rc;               /* Return code */

    rc = walIndexPage(pWal, iHash, &aPage);
    Debug.Assert( rc==SQLITE_OK || iHash>0 );
    if( rc!=SQLITE_OK ) return rc;  /* Outputs are left untouched on error */

    /* The hash table begins HASHTABLE_NPAGE u32 entries into the page. */
    *paHash = (volatile ht_slot *)&aPage[HASHTABLE_NPAGE];

    if( iHash==0 ){
      /* On page 0 the page-number array starts after the wal-index header. */
      aPage = &aPage[WALINDEX_HDR_SIZE/sizeof(u32)];
      *piZero = 0;
    }else{
      *piZero = HASHTABLE_NPAGE_ONE + (iHash-1)*HASHTABLE_NPAGE;
    }

    /* Offset by one entry so that index 1 of the returned array is the
    ** first page-number entry.
    */
    *paPgno = &aPage[-1];
    return rc;
  }
  835. /*
  836. ** Return the number of the wal-index page that contains the hash-table
  837. ** and page-number array that contain entries corresponding to WAL frame
  838. ** iFrame. The wal-index is broken up into 32KB pages. Wal-index pages
  839. ** are numbered starting from 0.
  840. */
  static int walFramePage(u32 iFrame){
    int iWiPage = (iFrame+HASHTABLE_NPAGE-HASHTABLE_NPAGE_ONE-1) / HASHTABLE_NPAGE;

    /* Cross-check the formula against the boundaries of the first three
    ** wal-index pages.
    */
    Debug.Assert( iWiPage==0 || iFrame>HASHTABLE_NPAGE_ONE );
    Debug.Assert( iWiPage>=1 || iFrame<=HASHTABLE_NPAGE_ONE );
    Debug.Assert( iWiPage<=1 || iFrame>(HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE) );
    Debug.Assert( iWiPage>=2 || iFrame<=HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE );
    Debug.Assert( iWiPage<=2 || iFrame>(HASHTABLE_NPAGE_ONE+2*HASHTABLE_NPAGE) );

    return iWiPage;
  }
  851. /*
  852. ** Return the page number associated with frame iFrame in this WAL.
  853. */
  static u32 walFramePgno(Wal *pWal, u32 iFrame){
    int iWiPage = walFramePage(iFrame);  /* wal-index page holding iFrame */

    if( iWiPage!=0 ){
      /* Later pages: index relative to the first frame of that page. */
      return pWal.apWiData[iWiPage][(iFrame-1-HASHTABLE_NPAGE_ONE)%HASHTABLE_NPAGE];
    }

    /* Page 0: page numbers are stored after the wal-index header. */
    return pWal.apWiData[0][WALINDEX_HDR_SIZE/sizeof(u32) + iFrame - 1];
  }
  861. /*
  862. ** Remove entries from the hash table that point to WAL slots greater
  863. ** than pWal.hdr.mxFrame.
  864. **
  865. ** This function is called whenever pWal.hdr.mxFrame is decreased due
  866. ** to a rollback or savepoint.
  867. **
  868. ** At most only the hash table containing pWal.hdr.mxFrame needs to be
  869. ** updated. Any later hash tables will be automatically cleared when
  870. ** pWal.hdr.mxFrame advances to the point where those hash tables are
  871. ** actually needed.
  872. */
  static void walCleanupHash(Wal *pWal){
    volatile ht_slot *aSlot = 0;  /* Hash table to clean */
    volatile u32 *aPage = 0;      /* Page number array for that hash table */
    u32 iBase = 0;                /* frame == (aSlot[x]+iBase) */
    int nKeep = 0;                /* Slot values above this are zeroed */
    int nZero;                    /* Number of bytes to zero in aPage[] */
    int iSlot;                    /* Index into aSlot[] */

    Debug.Assert( pWal.writeLock );
    testcase( pWal.hdr.mxFrame==HASHTABLE_NPAGE_ONE-1 );
    testcase( pWal.hdr.mxFrame==HASHTABLE_NPAGE_ONE );
    testcase( pWal.hdr.mxFrame==HASHTABLE_NPAGE_ONE+1 );

    /* Nothing to clean when mxFrame is zero. */
    if( pWal.hdr.mxFrame==0 ) return;

    /* Locate the hash table and page-number array that hold the entry for
    ** frame pWal.hdr.mxFrame. The asserts check that the wal-index page
    ** they live on is already mapped.
    */
    Debug.Assert( pWal.nWiData>walFramePage(pWal.hdr.mxFrame) );
    Debug.Assert( pWal.apWiData[walFramePage(pWal.hdr.mxFrame)] );
    walHashGet(pWal, walFramePage(pWal.hdr.mxFrame), &aSlot, &aPage, &iBase);

    /* Clear every hash slot that refers to a frame beyond mxFrame. */
    nKeep = pWal.hdr.mxFrame - iBase;
    Debug.Assert( nKeep>0 );
    iSlot = 0;
    while( iSlot<HASHTABLE_NSLOT ){
      if( aSlot[iSlot]>nKeep ){
        aSlot[iSlot] = 0;
      }
      iSlot++;
    }

    /* Clear the page-number entries for frames beyond mxFrame: everything
    ** from aPage[nKeep+1] up to the start of the hash table.
    */
    nZero = (int)((char *)aSlot - (char *)&aPage[nKeep+1]);
    memset((void *)&aPage[nKeep+1], 0, nZero);

  #if SQLITE_ENABLE_EXPENSIVE_ASSERT
    /* Check that each surviving page-number entry can still be found by
    ** probing the hash table.
    */
    if( nKeep ){
      int i;     /* Loop counter */
      int iKey;  /* Hash key */
      for(i=1; i<=nKeep; i++){
        iKey = walHash(aPage[i]);
        while( aSlot[iKey] ){
          if( aSlot[iKey]==i ) break;
          iKey = walNextHash(iKey);
        }
        Debug.Assert( aSlot[iKey]==i );
      }
    }
  #endif //* SQLITE_ENABLE_EXPENSIVE_ASSERT */
  }
  923. /*
  924. ** Set an entry in the wal-index that will map database page number
  925. ** iPage into WAL frame iFrame.
  926. */
  static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
    u32 iBase = 0;                /* One less than frame number of aPage[1] */
    volatile u32 *aPage = 0;      /* Page number array */
    volatile ht_slot *aSlot = 0;  /* Hash table */
    int iKey;                     /* Hash table key */
    int idx;                      /* Value to write to hash-table slot */
    int nProbe;                   /* Remaining collisions tolerated */
    int rc;                       /* Return code */

    /* Without a mapped wal-index page there is nothing to update. */
    rc = walHashGet(pWal, walFramePage(iFrame), &aSlot, &aPage, &iBase);
    if( rc!=SQLITE_OK ) return rc;

    idx = iFrame - iBase;
    Debug.Assert( idx <= HASHTABLE_NSLOT/2 + 1 );

    /* The first entry added to a hash table wipes the whole table and its
    ** page-number array before proceeding.
    */
    if( idx==1 ){
      int nByte = (int)((u8 *)&aSlot[HASHTABLE_NSLOT] - (u8 *)&aPage[1]);
      memset((void*)&aPage[1], 0, nByte);
    }

    /* A non-zero aPage[idx] means an earlier writer exited unexpectedly in
    ** the middle of a transaction (after writing one or more dirty pages to
    ** the WAL to free up memory). Discard the remnants of that uncommitted
    ** transaction from the hash table before adding new entries.
    */
    if( aPage[idx] ){
      walCleanupHash(pWal);
      Debug.Assert( !aPage[idx] );
    }

    /* Probe for a free slot, giving up after idx collisions. */
    nProbe = idx;
    iKey = walHash(iPage);
    while( aSlot[iKey] ){
      if( (nProbe--)==0 ) return SQLITE_CORRUPT_BKPT;
      iKey = walNextHash(iKey);
    }

    /* Record the page number and point the hash slot at it. */
    aPage[idx] = iPage;
    aSlot[iKey] = (ht_slot)idx;

  #if SQLITE_ENABLE_EXPENSIVE_ASSERT
    /* The hash table must hold exactly as many entries as the mapping
    ** region.
    */
    {
      int i;           /* Loop counter */
      int nEntry = 0;  /* Number of entries in the hash table */
      for(i=0; i<HASHTABLE_NSLOT; i++){
        if( aSlot[i] ) nEntry++;
      }
      Debug.Assert( nEntry==idx );
    }

    /* Every entry in the mapping region must be reachable through the hash
    ** table. This is a very expensive check, so it is only run when the low
    ** ten bits of idx are all zero rather than on every call.
    */
    if( (idx&0x3ff)==0 ){
      int i;  /* Loop counter */
      for(i=1; i<=idx; i++){
        iKey = walHash(aPage[i]);
        while( aSlot[iKey] ){
          if( aSlot[iKey]==i ) break;
          iKey = walNextHash(iKey);
        }
        Debug.Assert( aSlot[iKey]==i );
      }
    }
  #endif //* SQLITE_ENABLE_EXPENSIVE_ASSERT */

    return rc;
  }
  994. /*
  995. ** Recover the wal-index by reading the write-ahead log file.
  996. **
  997. ** This routine first tries to establish an exclusive lock on the
  998. ** wal-index to prevent other threads/processes from doing anything
  999. ** with the WAL or wal-index while recovery is running. The
  1000. ** WAL_RECOVER_LOCK is also held so that other threads will know
  1001. ** that this thread is running recovery. If unable to establish
  1002. ** the necessary locks, this routine returns SQLITE_BUSY.
  1003. */
  static int walIndexRecover(Wal *pWal){
    int rc; /* Return Code */
    i64 nSize; /* Size of log file */
    /* Running checksum as of the most recent commit frame read; copied into
    ** pWal.hdr.aFrameCksum once the scan is finished. Stays {0, 0} if no
    ** commit frame is found. */
    u32 aFrameCksum[2] = {0, 0};
    int iLock; /* Lock offset to lock for checkpoint */
    int nLock; /* Number of locks to hold */

    /* Obtain an exclusive lock on all bytes in the locking range not already
    ** locked by the caller. The caller is guaranteed to have locked the
    ** WAL_WRITE_LOCK byte, and may have also locked the WAL_CKPT_LOCK byte.
    ** If successful, the same bytes that are locked here are unlocked before
    ** this function returns.
    */
    Debug.Assert( pWal.ckptLock==1 || pWal.ckptLock==0 );
    Debug.Assert( WAL_ALL_BUT_WRITE==WAL_WRITE_LOCK+1 );
    Debug.Assert( WAL_CKPT_LOCK==WAL_ALL_BUT_WRITE );
    Debug.Assert( pWal.writeLock );
    /* Skip over the checkpoint lock byte when the caller already holds it
    ** (pWal.ckptLock==1). */
    iLock = WAL_ALL_BUT_WRITE + pWal.ckptLock;
    nLock = SQLITE_SHM_NLOCK - iLock;
    rc = walLockExclusive(pWal, iLock, nLock);
    if( rc ){
      /* No locks were obtained, so return directly without unlocking. */
      return rc;
    }
    WALTRACE(("WAL%p: recovery begin...\n", pWal));

    /* Start from an all-zero header; fields are filled in as the WAL is
    ** scanned below. */
    memset(&pWal.hdr, 0, sizeof(WalIndexHdr));

    rc = sqlite3OsFileSize(pWal.pWalFd, &nSize);
    if( rc!=SQLITE_OK ){
      goto recovery_error;
    }

    /* A file no larger than the WAL header cannot contain any frames. */
    if( nSize>WAL_HDRSIZE ){
      u8 aBuf[WAL_HDRSIZE]; /* Buffer to load WAL header into */
      u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */
      int szFrame; /* Number of bytes in buffer aFrame[] */
      u8 *aData; /* Pointer to data part of aFrame buffer */
      int iFrame; /* Index of last frame read */
      i64 iOffset; /* Next offset to read from log file */
      int szPage; /* Page size according to the log */
      u32 magic; /* Magic value read from WAL header */
      u32 version; /* WAL format version read from WAL header */

      /* Read in the WAL header. */
      rc = sqlite3OsRead(pWal.pWalFd, aBuf, WAL_HDRSIZE, 0);
      if( rc!=SQLITE_OK ){
        goto recovery_error;
      }

      /* If the database page size is not a power of two, or is greater than
      ** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid
      ** data. Similarly, if the 'magic' value is invalid, ignore the whole
      ** WAL file.
      */
      magic = sqlite3Get4byte(&aBuf[0]);
      szPage = sqlite3Get4byte(&aBuf[8]);
      if( (magic&0xFFFFFFFE)!=WAL_MAGIC
       || szPage&(szPage-1)
       || szPage>SQLITE_MAX_PAGE_SIZE
       || szPage<512
      ){
        /* rc is still SQLITE_OK here, so an empty wal-index header is
        ** written at the "finished" label. */
        goto finished;
      }
      /* The low bit of the magic number selects the checksum byte order. */
      pWal.hdr.bigEndCksum = (u8)(magic&0x00000001);
      pWal.szPage = (u16)szPage;
      pWal.nCkpt = sqlite3Get4byte(&aBuf[12]);
      memcpy(&pWal.hdr.aSalt, &aBuf[16], 8);

      /* Verify that the WAL header checksum is correct. The checksum covers
      ** the header minus its final two 4-byte words, which hold the stored
      ** checksum itself. */
      walChecksumBytes(pWal.hdr.bigEndCksum==SQLITE_BIGENDIAN,
          aBuf, WAL_HDRSIZE-2*4, 0, pWal.hdr.aFrameCksum
      );
      if( pWal.hdr.aFrameCksum[0]!=sqlite3Get4byte(&aBuf[24])
       || pWal.hdr.aFrameCksum[1]!=sqlite3Get4byte(&aBuf[28])
      ){
        goto finished;
      }

      /* Verify that the version number on the WAL format is one that
      ** we are able to understand */
      version = sqlite3Get4byte(&aBuf[4]);
      if( version!=WAL_MAX_VERSION ){
        /* rc is now an error code, so the block at "finished" is skipped. */
        rc = SQLITE_CANTOPEN_BKPT;
        goto finished;
      }

      /* Malloc a buffer to read frames into. */
      szFrame = szPage + WAL_FRAME_HDRSIZE;
      aFrame = (u8 *)sqlite3_malloc(szFrame);
      if( !aFrame ){
        rc = SQLITE_NOMEM;
        goto recovery_error;
      }
      aData = &aFrame[WAL_FRAME_HDRSIZE];

      /* Read all frames from the log file. The loop stops at the first
      ** read error, the first invalid frame, or when fewer than szFrame
      ** bytes remain in the file. */
      iFrame = 0;
      for(iOffset=WAL_HDRSIZE; (iOffset+szFrame)<=nSize; iOffset+=szFrame){
        u32 pgno; /* Database page number for frame */
        u32 nTruncate; /* dbsize field from frame header */
        int isValid; /* True if this frame is valid */

        /* Read and decode the next log frame. */
        rc = sqlite3OsRead(pWal.pWalFd, aFrame, szFrame, iOffset);
        if( rc!=SQLITE_OK ) break;
        isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame);
        if( !isValid ) break;
        rc = walIndexAppend(pWal, ++iFrame, pgno);
        if( rc!=SQLITE_OK ) break;

        /* If nTruncate is non-zero, this is a commit record. Only at a
        ** commit record are mxFrame, nPage, szPage and the saved checksum
        ** advanced. */
        if( nTruncate ){
          pWal.hdr.mxFrame = iFrame;
          pWal.hdr.nPage = nTruncate;
          pWal.hdr.szPage = (u16)szPage;
          aFrameCksum[0] = pWal.hdr.aFrameCksum[0];
          aFrameCksum[1] = pWal.hdr.aFrameCksum[1];
        }
      }

      sqlite3_free(aFrame);
    }

  finished:
    if( rc==SQLITE_OK ){
      volatile WalCkptInfo *pInfo;
      int i;
      /* Restore the checksum saved at the last commit frame, then write the
      ** recovered header into the wal-index. */
      pWal.hdr.aFrameCksum[0] = aFrameCksum[0];
      pWal.hdr.aFrameCksum[1] = aFrameCksum[1];
      walIndexWriteHdr(pWal);

      /* Reset the checkpoint-header. This is safe because this thread is
      ** currently holding locks that exclude all other readers, writers and
      ** checkpointers.
      */
      pInfo = walCkptInfo(pWal);
      pInfo.nBackfill = 0;
      pInfo.aReadMark[0] = 0;
      for(i=1; i<WAL_NREADER; i++) pInfo.aReadMark[i] = READMARK_NOT_USED;
    }

  recovery_error:
    /* Reached on both success and failure: release the locks taken above. */
    WALTRACE(("WAL%p: recovery %s\n", pWal, rc ? "failed" : "ok"));
    walUnlockExclusive(pWal, iLock, nLock);
    return rc;
  }
  1134. /*
  1135. ** Close an open wal-index.
  1136. */
  static void walIndexClose(Wal *pWal, int isDelete){
    int iPg;  /* Index into pWal.apWiData[] */

    /* Outside heap-memory mode the wal-index is unmapped through the OS
    ** layer, which also receives the isDelete flag.
    */
    if( pWal.exclusiveMode!=WAL_HEAPMEMORY_MODE ){
      sqlite3OsShmUnmap(pWal.pDbFd, isDelete);
      return;
    }

    /* Heap-memory mode: free each wal-index page and clear its pointer. */
    for(iPg=0; iPg<pWal.nWiData; iPg++){
      sqlite3_free((void *)pWal.apWiData[iPg]);
      pWal.apWiData[iPg] = 0;
    }
  }
  1148. /*
  1149. ** Open a connection to the WAL file zWalName. The database file must
  1150. ** already be opened on connection pDbFd. The buffer that zWalName points
  1151. ** to must remain valid for the lifetime of the returned Wal* handle.
  1152. **
  1153. ** A SHARED lock should be held on the database file when this function
  1154. ** is called. The purpose of this SHARED lock is to prevent any other
  1155. ** client from unlinking the WAL or wal-index file. If another process
  1156. ** were to do this just after this client opened one of these files, the
  1157. ** system would be badly broken.
  1158. **
  1159. ** If the log file is successfully opened, SQLITE_OK is returned and
  1160. ** *ppWal is set to point to a new WAL handle. If an error occurs,
  1161. ** an SQLite error code is returned and *ppWal is set to NULL (0).
  1162. */
  int sqlite3WalOpen(
    sqlite3_vfs *pVfs,     /* vfs module to open wal and wal-index */
    sqlite3_file *pDbFd,   /* The open database file */
    const char *zWalName,  /* Name of the WAL file */
    int bNoShm,            /* True to run in heap-memory mode */
    Wal **ppWal            /* OUT: Allocated Wal handle */
  ){
    int rc;     /* Return Code */
    Wal *pRet;  /* Object to allocate and return */
    int flags;  /* Flags passed to OsOpen() */

    Debug.Assert( zWalName && zWalName[0] );
    Debug.Assert( pDbFd );

    /* In the amalgamation, the os_unix.c and os_win.c source files come before
    ** this source file. Verify that the #defines of the locking byte offsets
    ** in os_unix.c and os_win.c agree with the WALINDEX_LOCK_OFFSET value.
    */
  #if WIN_SHM_BASE
    Debug.Assert( WIN_SHM_BASE==WALINDEX_LOCK_OFFSET );
  #endif
  #if UNIX_SHM_BASE
    Debug.Assert( UNIX_SHM_BASE==WALINDEX_LOCK_OFFSET );
  #endif

    /* Allocate an instance of struct Wal to return. The allocation also
    ** includes pVfs.szOsFile extra bytes directly after the Wal object,
    ** which are used as the WAL file handle. *ppWal is cleared first, so it
    ** is 0 on every error return.
    */
    *ppWal = 0;
    pRet = (Wal*)sqlite3MallocZero(sizeof(Wal) + pVfs.szOsFile);
    if( !pRet ){
      return SQLITE_NOMEM;
    }

    pRet.pVfs = pVfs;
    pRet.pWalFd = (sqlite3_file *)&pRet[1];
    pRet.pDbFd = pDbFd;
    pRet.readLock = -1;
    pRet.zWalName = zWalName;
    pRet.exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE);

    /* Open file handle on the write-ahead log file. The flags variable is
    ** also passed as the output argument and is inspected afterwards for
    ** SQLITE_OPEN_READONLY.
    */
    flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL);
    rc = sqlite3OsOpen(pVfs, zWalName, pRet.pWalFd, flags, &flags);
    if( rc==SQLITE_OK && flags&SQLITE_OPEN_READONLY ){
      pRet.readOnly = 1;
    }

    if( rc!=SQLITE_OK ){
      /* Opening failed: release everything allocated above. */
      walIndexClose(pRet, 0);
      sqlite3OsClose(pRet.pWalFd);
      sqlite3_free(pRet);
    }else{
      *ppWal = pRet;
      /* Trace the handle with %p, as every other WALTRACE call in this file
      ** does; pRet is a pointer, so %d was the wrong conversion.
      */
      WALTRACE(("WAL%p: opened\n", pRet));
    }
    return rc;
  }
  1213. /*
  1214. ** Find the smallest page number out of all pages held in the WAL that
  1215. ** has not been returned by any prior invocation of this method on the
  1216. ** same WalIterator object. Write into *piFrame the frame index where
  1217. ** that page was last written into the WAL. Write into *piPage the page
  1218. ** number.
  1219. **
  1220. ** Return 0 on success. If there are no pages in the WAL with a page
  1221. ** number larger than *piPage, then return 1.
  1222. */
  static int walIteratorNext(
    WalIterator *p,  /* Iterator */
    u32 *piPage,     /* OUT: The page number of the next page */
    u32 *piFrame     /* OUT: Wal frame index of next page */
  ){
    u32 iFloor = p.iPrior;   /* Result pgno must be greater than this */
    u32 iBest = 0xFFFFFFFF;  /* 0xffffffff is never a valid page number */
    int iSeg;                /* For looping through segments */

    Debug.Assert( iFloor<0xffffffff );

    /* Visit the segments from last to first. In each one, step past the
    ** entries that are not above iFloor; the first entry above iFloor is
    ** that segment's candidate. On equal page numbers the segment visited
    ** first (the later one) wins.
    */
    for(iSeg=p.nSegment-1; iSeg>=0; iSeg--){
      struct WalSegment *pSeg = &p.aSegment[iSeg];
      for(; pSeg.iNext<pSeg.nEntry; pSeg.iNext++){
        u32 iPg = pSeg.aPgno[pSeg.aIndex[pSeg.iNext]];
        if( iPg<=iFloor ) continue;
        if( iPg<iBest ){
          iBest = iPg;
          *piFrame = pSeg.iZero + pSeg.aIndex[pSeg.iNext];
        }
        break;
      }
    }

    /* Remember the result so the next call resumes above it. */
    p.iPrior = iBest;
    *piPage = iBest;
    return (iBest==0xFFFFFFFF);
  }
  1250. /*
  1251. ** This function merges two sorted lists into a single sorted list.
  1252. */
  static void walMerge(
    u32 *aContent,      /* Pages in wal */
    ht_slot *aLeft,     /* IN: Left hand input list */
    int nLeft,          /* IN: Elements in array *paLeft */
    ht_slot **paRight,  /* IN/OUT: Right hand input list */
    int *pnRight,       /* IN/OUT: Elements in *paRight */
    ht_slot *aTmp       /* Temporary buffer */
  ){
    ht_slot *aRight = *paRight;  /* Right hand input list */
    int nRight = *pnRight;       /* Elements in aRight */
    int iL = 0;                  /* Current index in aLeft */
    int iR = 0;                  /* Current index in aRight */
    int nOut = 0;                /* Entries written to aTmp so far */

    Debug.Assert( nLeft>0 && nRight>0 );

    while( iR<nRight || iL<nLeft ){
      ht_slot iPick;  /* Entry chosen for output this round */
      Pgno pgno;      /* Page number that entry refers to */
      int fromLeft;   /* True to take the next entry from aLeft */

      /* Take from the left list only when it is strictly smaller, or when
      ** the right list is exhausted. On a tie the right entry is taken.
      */
      if( iL>=nLeft ){
        fromLeft = 0;
      }else if( iR>=nRight ){
        fromLeft = 1;
      }else{
        fromLeft = (aContent[aLeft[iL]]<aContent[aRight[iR]]);
      }
      if( fromLeft ){
        iPick = aLeft[iL++];
      }else{
        iPick = aRight[iR++];
      }

      pgno = aContent[iPick];
      aTmp[nOut++] = iPick;

      /* Drop a left entry that refers to the same page as the one just
      ** emitted, so the output holds each page only once.
      */
      if( iL<nLeft && aContent[aLeft[iL]]==pgno ) iL++;

      Debug.Assert( iL>=nLeft || aContent[aLeft[iL]]>pgno );
      Debug.Assert( iR>=nRight || aContent[aRight[iR]]>pgno );
    }

    /* The merged result is stored in aLeft[], which becomes the new right
    ** hand list handed back to the caller.
    */
    *paRight = aLeft;
    *pnRight = nOut;
    memcpy(aLeft, aTmp, sizeof(aTmp[0])*nOut);
  }
  1287. /*
  1288. ** Sort the elements in list aList, removing any duplicates.
  1289. */
  static void walMergesort(
    u32 *aContent, /* Pages in wal */
    ht_slot *aBuffer, /* Buffer of at least *pnList items to use */
    ht_slot *aList, /* IN/OUT: List to sort */
    int *pnList /* IN/OUT: Number of elements in aList[] */
  ){
    /* One pending sorted run. Slot iSub of aSub[] holds a run that, per the
    ** asserts below, contains at most (1<<iSub) elements. */
    struct Sublist {
      int nList; /* Number of elements in aList */
      ht_slot *aList; /* Pointer to sub-list content */
    };

    const int nList = *pnList; /* Size of input list */
    int nMerge = 0; /* Number of elements in list aMerge */
    ht_slot *aMerge = 0; /* List to be merged */
    int iList; /* Index into input list */
    int iSub = 0; /* Index into aSub array */
    struct Sublist aSub[13]; /* Array of sub-lists */

    memset(aSub, 0, sizeof(aSub));
    Debug.Assert( nList<=HASHTABLE_NPAGE && nList>0 );
    Debug.Assert( HASHTABLE_NPAGE==(1<<(ArraySize(aSub)-1)) );

    /* Feed the input in one element at a time. Each new element starts as
    ** a run of length 1 and is merged with the pending run at level iSub
    ** for every consecutive low-order bit that is set in iList; the result
    ** is then parked at the first level whose bit is clear. */
    for(iList=0; iList<nList; iList++){
      nMerge = 1;
      aMerge = &aList[iList];
      for(iSub=0; iList & (1<<iSub); iSub++){
        struct Sublist *p = &aSub[iSub];
        Debug.Assert( p.aList && p.nList<=(1<<iSub) );
        Debug.Assert( p.aList==&aList[iList&~((2<<iSub)-1)] );
        walMerge(aContent, p.aList, p.nList, &aMerge, &nMerge, aBuffer);
      }
      aSub[iSub].aList = aMerge;
      aSub[iSub].nList = nMerge;
    }

    /* Fold in the remaining pending runs: one exists at each higher level
    ** whose bit is set in nList. aMerge/nMerge still describe the run
    ** produced by the final iteration of the loop above. */
    for(iSub++; iSub<ArraySize(aSub); iSub++){
      if( nList & (1<<iSub) ){
        struct Sublist *p = &aSub[iSub];
        Debug.Assert( p.nList<=(1<<iSub) );
        Debug.Assert( p.aList==&aList[nList&~((2<<iSub)-1)] );
        walMerge(aContent, p.aList, p.nList, &aMerge, &nMerge, aBuffer);
      }
    }
    /* The fully merged run must start at the beginning of aList[]. */
    Debug.Assert( aMerge==aList );
    /* Report the final length, which may be smaller than the input length
    ** because walMerge() drops duplicate pages. */
    *pnList = nMerge;

  #if SQLITE_DEBUG
    /* Check that the result is strictly ascending by page number. */
    {
      int i;
      for(i=1; i<*pnList; i++){
        Debug.Assert( aContent[aList[i]] > aContent[aList[i-1]] );
      }
    }
  #endif
  }
  1340. /*
  1341. ** Free an iterator allocated by walIteratorInit().
  1342. */
  static void walIteratorFree(WalIterator *p){
    /* The iterator is allocated with sqlite3ScratchMalloc() in
    ** walIteratorInit(), so it is released with the matching scratch free. */
    sqlite3ScratchFree(p);
  }
  1346. /*
  1347. ** Construct a WalIterator object that can be used to loop over all pages
  1348. ** in the WAL in ascending order. The caller must hold the checkpoint lock.
  1349. **
  1350. ** On success, make *pp point to the newly allocated WalIterator object
  1351. ** and return SQLITE_OK. Otherwise, return an error code. If this routine
  1352. ** returns an error, the value of *pp is undefined.
  1353. **
  1354. ** The calling routine should invoke walIteratorFree() to destroy the
  1355. ** WalIterator object when it has finished with it.
  1356. */
  static int walIteratorInit(Wal *pWal, WalIterator **pp){
    WalIterator *p;      /* Return value */
    int nSegment;        /* Number of segments to merge */
    u32 iLast;           /* Last frame in log */
    int nByte;           /* Number of bytes to allocate */
    int i;               /* Iterator variable */
    ht_slot *aTmp;       /* Temp space used by merge-sort */
    int rc = SQLITE_OK;  /* Return Code */

    /* This routine only runs while holding the checkpoint lock. And
    ** it only runs if there is actually content in the log (mxFrame>0).
    */
    Debug.Assert( pWal.ckptLock && pWal.hdr.mxFrame>0 );
    iLast = pWal.hdr.mxFrame;

    /* Allocate space for the WalIterator object. The allocation holds the
    ** iterator itself, one WalSegment per wal-index page in use, and one
    ** ht_slot per frame for the sorted indexes.
    */
    nSegment = walFramePage(iLast) + 1;
    nByte = sizeof(WalIterator)
          + (nSegment-1)*sizeof(struct WalSegment)
          + iLast*sizeof(ht_slot);
    p = (WalIterator *)sqlite3ScratchMalloc(nByte);
    if( !p ){
      /* *pp is not written on this path. */
      return SQLITE_NOMEM;
    }
    memset(p, 0, nByte);
    p.nSegment = nSegment;

    /* Allocate temporary space used by the merge-sort routine. This block
    ** of memory will be freed before this function returns.
    */
    aTmp = (ht_slot *)sqlite3ScratchMalloc(
        sizeof(ht_slot) * (iLast>HASHTABLE_NPAGE?HASHTABLE_NPAGE:iLast)
    );
    if( !aTmp ){
      rc = SQLITE_NOMEM;
    }

    /* Build a sorted index for each segment. The loop stops as soon as rc
    ** holds an error, including the allocation failure just above.
    */
    for(i=0; rc==SQLITE_OK && i<nSegment; i++){
      volatile ht_slot *aHash;
      u32 iZero;
      volatile u32 *aPgno;

      rc = walHashGet(pWal, i, &aHash, &aPgno, &iZero);
      if( rc==SQLITE_OK ){
        int j;            /* Counter variable */
        int nEntry;       /* Number of entries in this segment */
        ht_slot *aIndex;  /* Sorted index for this segment */

        /* walHashGet() returns the array offset so that index 1 is the
        ** first entry; step forward so that index 0 is the first entry.
        */
        aPgno++;
        if( (i+1)==nSegment ){
          /* Final segment: only the frames up to iLast are in use. */
          nEntry = (int)(iLast - iZero);
        }else{
          /* Full segment: entries run up to the start of the hash table. */
          nEntry = (int)((u32*)aHash - (u32*)aPgno);
        }
        /* This segment's index lives in the tail of the allocation, after
        ** the last WalSegment, at offset iZero.
        */
        aIndex = &((ht_slot *)&p.aSegment[p.nSegment])[iZero];
        iZero++;

        for(j=0; j<nEntry; j++){
          aIndex[j] = (ht_slot)j;
        }
        walMergesort((u32 *)aPgno, aTmp, aIndex, &nEntry);
        p.aSegment[i].iZero = iZero;
        p.aSegment[i].nEntry = nEntry;
        p.aSegment[i].aIndex = aIndex;
        p.aSegment[i].aPgno = (u32 *)aPgno;
      }
    }
    sqlite3ScratchFree(aTmp);

    if( rc!=SQLITE_OK ){
      /* Free the iterator and hand back 0 rather than a pointer to freed
      ** memory, so a caller that frees or inspects *pp after an error does
      ** not touch a dangling pointer.
      */
      walIteratorFree(p);
      p = 0;
    }
    *pp = p;
    return rc;
  }
  1424. /*
  1425. ** Copy as much content as we can from the WAL back into the database file
  1426. ** in response to an sqlite3_wal_checkpoint() request or the equivalent.
  1427. **
  1428. ** The amount of information copied from WAL to database might be limited
  1429. ** by active readers. This routine will never overwrite a database page
  1430. ** that a concurrent reader might be using.
  1431. **
  1432. ** All I/O barrier operations (a.k.a fsyncs) occur in this routine when
  1433. ** SQLite is in WAL-mode in synchronous=NORMAL. That means that if
  1434. ** checkpoints are always run by a background thread or background
  1435. ** process, foreground threads will never block on a lengthy fsync call.
  1436. **
  1437. ** Fsync is called on the WAL before writing content out of the WAL and
  1438. ** into the database. This ensures that the new content is persistent
  1439. ** in the WAL and can be recovered following a power-loss or hard reset.
  1440. **
  1441. ** Fsync is also called on the database file if (and only if) the entire
  1442. ** WAL content is copied into the database file. This second fsync makes
  1443. ** it safe to delete the WAL since the new content will persist in the
  1444. ** database file.
  1445. **
  1446. ** This routine uses and updates the nBackfill field of the wal-index header.
  1447. ** This is the only routine that will increase the value of nBackfill.
  1448. ** (A WAL reset or recovery will revert nBackfill to zero, but not increase
  1449. ** its value.)
  1450. **
  1451. ** The caller must be holding sufficient locks to ensure that no other
  1452. ** checkpoint is running (in any other thread or process) at the same
  1453. ** time.
  1454. */
  1455. static int walCheckpoint(
  1456. Wal *pWal, /* Wal connection */
  1457. int sync_flags, /* Flags for OsSync() (or 0) */
  1458. int nBuf, /* Size of zBuf in bytes */
  1459. u8 *zBuf /* Temporary buffer to use */
  1460. ){
  1461. int rc; /* Return code */
  1462. int szPage = pWal.hdr.szPage; /* Database page-size */
  1463. WalIterator *pIter = 0; /* Wal iterator context */
  1464. u32 iDbpage = 0; /* Next database page to write */
  1465. u32 iFrame = 0; /* Wal frame containing data for iDbpage */
  1466. u32 mxSafeFrame; /* Max frame that can be backfilled */
  1467. int i; /* Loop counter */
  1468. volatile WalCkptInfo *pInfo; /* The checkpoint status information */
  1469. if( pWal.hdr.mxFrame==0 ) return SQLITE_OK;
  1470. /* Allocate the iterator */
  1471. rc = walIteratorInit(pWal, &pIter);
  1472. if( rc!=SQLITE_OK ){
  1473. return rc;
  1474. }
  1475. Debug.Assert( pIter );
  1476. /*** TODO: Move this test out to the caller. Make it an Debug.Assert() here ***/
  1477. if( pWal.hdr.szPage!=nBuf ){
  1478. rc = SQLITE_CORRUPT_BKPT;
  1479. goto walcheckpoint_out;
  1480. }
  1481. /* Compute in mxSafeFrame the index of the last frame of the WAL that is
  1482. ** safe to write into the database. Frames beyond mxSafeFrame might
  1483. ** overwrite database pages that are in use by active readers and thus
  1484. ** cannot be backfilled from the WAL.
  1485. */
  1486. mxSafeFrame = pWal.hdr.mxFrame;
  1487. pInfo = walCkptInfo(pWal);
  1488. for(i=1; i<WAL_NREADER; i++){
  1489. u32 y = pInfo.aReadMark[i];
  1490. if( mxSafeFrame>=y ){
  1491. Debug.Assert( y<=pWal.hdr.mxFrame );
  1492. rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
  1493. if( rc==SQLITE_OK ){
  1494. pInfo.aReadMark[i] = READMARK_NOT_USED;
  1495. walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
  1496. }else if( rc==SQLITE_BUSY ){
  1497. mxSafeFrame = y;
  1498. }else{
  1499. goto walcheckpoint_out;
  1500. }
  1501. }
  1502. }
  1503. if( pInfo.nBackfill<mxSafeFrame
  1504. && (rc = walLockExclusive(pWal, WAL_READ_LOCK(0), 1))==SQLITE_OK
  1505. ){
  1506. u32 nBackfill = pInfo.nBackfill;
  1507. /* Sync the WAL to disk */
  1508. if( sync_flags ){
  1509. rc = sqlite3OsSync(pWal.pWalFd, sync_flags);
  1510. }
  1511. /* Iterate through the contents of the WAL, copying data to the db file. */
  1512. while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
  1513. i64 iOffset;
  1514. Debug.Assert( walFramePgno(pWal, iFrame)==iDbpage );
  1515. if( iFrame<=nBackfill || iFrame>mxSafeFrame ) continue;
  1516. iOffset = walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE;
  1517. /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL file */
  1518. rc = sqlite3OsRead(pWal.pWalFd, zBuf, szPage, iOffset);
  1519. if( rc!=SQLITE_OK ) break;
  1520. iOffset = (iDbpage-1)*(i64)szPage;
  1521. testcase( IS_BIG_INT(iOffset) );
  1522. rc = sqlite3OsWrite(pWal.pDbFd, zBuf, szPage, iOffset);
  1523. if( rc!=SQLITE_OK ) break;
  1524. }
  1525. /* If work was actually accomplished... */
  1526. if( rc==SQLITE_OK ){
  1527. if( mxSafeFrame==walIndexHdr(pWal).mxFrame ){
  1528. i64 szDb = pWal.hdr.nPage*(i64)szPage;
  1529. testcase( IS_BIG_INT(szDb) );
  1530. rc = sqlite3OsTruncate(pWal.pDbFd, szDb);
  1531. if( rc==SQLITE_OK && sync_flags ){
  1532. rc = sqlite3OsSync(pWal.pDbFd, sync_flags);
  1533. }
  1534. }
  1535. if( rc==SQLITE_OK ){
  1536. pInfo.nBackfill = mxSafeFrame;
  1537. }
  1538. }
  1539. /* Release the reader lock held while backfilling */
  1540. walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1);
  1541. }else if( rc==SQLITE_BUSY ){
  1542. /* Reset the return code so as not to report a checkpoint failure
  1543. ** just because active readers prevent any backfill.
  1544. */
  1545. rc = SQLITE_OK;
  1546. }
  1547. walcheckpoint_out:
  1548. walIteratorFree(pIter);
  1549. return rc;
  1550. }
  1551. /*
  1552. ** Close a connection to a log file.
  1553. */
  1554. int sqlite3WalClose(
  1555. Wal *pWal, /* Wal to close */
  1556. int sync_flags, /* Flags to pass to OsSync() (or 0) */
  1557. int nBuf,
  1558. u8 *zBuf /* Buffer of at least nBuf bytes */
  1559. ){
  1560. int rc = SQLITE_OK;
  1561. if( pWal ){
  1562. int isDelete = 0; /* True to unlink wal and wal-index files */
  1563. /* If an EXCLUSIVE lock can be obtained on the database file (using the
  1564. ** ordinary, rollback-mode locking methods, this guarantees that the
  1565. ** connection associated with this log file is the only connection to
  1566. ** the database. In this case checkpoint the database and unlink both
  1567. ** the wal and wal-index files.
  1568. **
  1569. ** The EXCLUSIVE lock is not released before returning.
  1570. */
  1571. rc = sqlite3OsLock(pWal.pDbFd, SQLITE_LOCK_EXCLUSIVE);
  1572. if( rc==SQLITE_OK ){
  1573. if( pWal.exclusiveMode==WAL_NORMAL_MODE ){
  1574. pWal.exclusiveMode = WAL_EXCLUSIVE_MODE;
  1575. }
  1576. rc = sqlite3WalCheckpoint(pWal, sync_flags, nBuf, zBuf);
  1577. if( rc==SQLITE_OK ){
  1578. isDelete = 1;
  1579. }
  1580. }
  1581. walIndexClose(pWal, isDelete);
  1582. sqlite3OsClose(pWal.pWalFd);
  1583. if( isDelete ){
  1584. sqlite3OsDelete(pWal.pVfs, pWal.zWalName, 0);
  1585. }
  1586. WALTRACE(("WAL%p: closed\n", pWal));
  1587. sqlite3_free((void *)pWal.apWiData);
  1588. sqlite3_free(pWal);
  1589. }
  1590. return rc;
  1591. }
  1592. /*
  1593. ** Try to read the wal-index header. Return 0 on success and 1 if
  1594. ** there is a problem.
  1595. **
  1596. ** The wal-index is in shared memory. Another thread or process might
  1597. ** be writing the header at the same time this procedure is trying to
  1598. ** read it, which might result in inconsistency. A dirty read is detected
  1599. ** by verifying that both copies of the header are the same and also by
  1600. ** a checksum on the header.
  1601. **
  1602. ** If and only if the read is consistent and the header is different from
  1603. ** pWal.hdr, then pWal.hdr is updated to the content of the new header
  1604. ** and *pChanged is set to 1.
  1605. **
  1606. ** If the checksum cannot be verified return non-zero. If the header
  1607. ** is read successfully and the checksum verified, return zero.
  1608. */
  1609. static int walIndexTryHdr(Wal *pWal, int *pChanged){
  1610. u32 aCksum[2]; /* Checksum on the header content */
  1611. WalIndexHdr h1, h2; /* Two copies of the header content */
  1612. WalIndexHdr volatile *aHdr; /* Header in shared memory */
  1613. /* The first page of the wal-index must be mapped at this point. */
  1614. Debug.Assert( pWal.nWiData>0 && pWal.apWiData[0] );
  1615. /* Read the header. This might happen currently with a write to the
  1616. ** same area of shared memory on a different CPU in a SMP,
  1617. ** meaning it is possible that an inconsistent snapshot is read
  1618. ** from the file. If this happens, return non-zero.
  1619. **
  1620. ** There are two copies of the header at the beginning of the wal-index.
  1621. ** When reading, read [0] first then [1]. Writes are in the reverse order.
  1622. ** Memory barriers are used to prevent the compiler or the hardware from
  1623. ** reordering the reads and writes.
  1624. */
  1625. aHdr = walIndexHdr(pWal);
  1626. memcpy(&h1, (void *)&aHdr[0], sizeof(h1));
  1627. walShmBarrier(pWal);
  1628. memcpy(&h2, (void *)&aHdr[1], sizeof(h2));
  1629. if( memcmp(&h1, &h2, sizeof(h1))!=0 ){
  1630. return 1; /* Dirty read */
  1631. }
  1632. if( h1.isInit==0 ){
  1633. return 1; /* Malformed header - probably all zeros */
  1634. }
  1635. walChecksumBytes(1, (u8*)&h1, sizeof(h1)-sizeof(h1.aCksum), 0, aCksum);
  1636. if( aCksum[0]!=h1.aCksum[0] || aCksum[1]!=h1.aCksum[1] ){
  1637. return 1; /* Checksum does not match */
  1638. }
  1639. if( memcmp(&pWal.hdr, &h1, sizeof(WalIndexHdr)) ){
  1640. *pChanged = 1;
  1641. memcpy(&pWal.hdr, &h1, sizeof(WalIndexHdr));
  1642. pWal.szPage = pWal.hdr.szPage;
  1643. }
  1644. /* The header was successfully read. Return zero. */
  1645. return 0;
  1646. }
  1647. /*
  1648. ** Read the wal-index header from the wal-index and into pWal.hdr.
  1649. ** If the wal-header appears to be corrupt, try to reconstruct the
  1650. ** wal-index from the WAL before returning.
  1651. **
  1652. ** Set *pChanged to 1 if the wal-index header value in pWal.hdr is
  1653. ** changed by this operation. If pWal.hdr is unchanged, set *pChanged
  1654. ** to 0.
  1655. **
  1656. ** If the wal-index header is successfully read, return SQLITE_OK.
  1657. ** Otherwise an SQLite error code.
  1658. */
  1659. static int walIndexReadHdr(Wal *pWal, int *pChanged){
  1660. int rc; /* Return code */
  1661. int badHdr; /* True if a header read failed */
  1662. volatile u32 *page0; /* Chunk of wal-index containing header */
  1663. /* Ensure that page 0 of the wal-index (the page that contains the
  1664. ** wal-index header) is mapped. Return early if an error occurs here.
  1665. */
  1666. Debug.Assert( pChanged );
  1667. rc = walIndexPage(pWal, 0, &page0);
  1668. if( rc!=SQLITE_OK ){
  1669. return rc;
  1670. };
  1671. Debug.Assert( page0 || pWal.writeLock==0 );
  1672. /* If the first page of the wal-index has been mapped, try to read the
  1673. ** wal-index header immediately, without holding any lock. This usually
  1674. ** works, but may fail if the wal-index header is corrupt or currently
  1675. ** being modified by another thread or process.
  1676. */
  1677. badHdr = (page0 ? walIndexTryHdr(pWal, pChanged) : 1);
  1678. /* If the first attempt failed, it might have been due to a race
  1679. ** with a writer. So get a WRITE lock and try again.
  1680. */
  1681. Debug.Assert( badHdr==0 || pWal.writeLock==0 );
  1682. if( badHdr && SQLITE_OK==(rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1)) ){
  1683. pWal.writeLock = 1;
  1684. if( SQLITE_OK==(rc = walIndexPage(pWal, 0, &page0)) ){
  1685. badHdr = walIndexTryHdr(pWal, pChanged);
  1686. if( badHdr ){
  1687. /* If the wal-index header is still malformed even while holding
  1688. ** a WRITE lock, it can only mean that the header is corrupted and
  1689. ** needs to be reconstructed. So run recovery to do exactly that.
  1690. */
  1691. rc = walIndexRecover(pWal);
  1692. *pChanged = 1;
  1693. }
  1694. }
  1695. pWal.writeLock = 0;
  1696. walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
  1697. }
  1698. /* If the header is read successfully, check the version number to make
  1699. ** sure the wal-index was not constructed with some future format that
  1700. ** this version of SQLite cannot understand.
  1701. */
  1702. if( badHdr==0 && pWal.hdr.iVersion!=WALINDEX_MAX_VERSION ){
  1703. rc = SQLITE_CANTOPEN_BKPT;
  1704. }
  1705. return rc;
  1706. }
  1707. /*
  1708. ** This is the value that walTryBeginRead returns when it needs to
  1709. ** be retried.
  1710. */
  /* Was "#define WAL_RETRY (-1)" in the C original. */
  const int WAL_RETRY = -1;
  1713. /*
  1714. ** Attempt to start a read transaction. This might fail due to a race or
  1715. ** other transient condition. When that happens, it returns WAL_RETRY to
  1716. ** indicate to the caller that it is safe to retry immediately.
  1717. **
  1718. ** On success return SQLITE_OK. On a permanent failure (such as an
  1719. ** I/O error or an SQLITE_BUSY because another process is running
  1720. ** recovery) return a positive error code.
  1721. **
  1722. ** The useWal parameter is true to force the use of the WAL and disable
  1723. ** the case where the WAL is bypassed because it has been completely
  1724. ** checkpointed. If useWal==0 then this routine calls walIndexReadHdr()
  1725. ** to make a copy of the wal-index header into pWal.hdr. If the
  1726. ** wal-index header has changed, *pChanged is set to 1 (as an indication
  1727. ** to the caller that the local page cache is obsolete and needs to be
  1728. ** flushed.) When useWal==1, the wal-index header is assumed to already
  1729. ** be loaded and the pChanged parameter is unused.
  1730. **
  1731. ** The caller must set the cnt parameter to the number of prior calls to
  1732. ** this routine during the current read attempt that returned WAL_RETRY.
  1733. ** This routine will start taking more aggressive measures to clear the
  1734. ** race conditions after multiple WAL_RETRY returns, and after an excessive
  1735. ** number of errors will ultimately return SQLITE_PROTOCOL. The
  1736. ** SQLITE_PROTOCOL return indicates that some other process has gone rogue
  1737. ** and is not honoring the locking protocol. There is a vanishingly small
  1738. ** chance that SQLITE_PROTOCOL could be returned because of a run of really
  1739. ** bad luck when there is lots of contention for the wal-index, but that
  1740. ** possibility is so small that it can be safely neglected, we believe.
  1741. **
  1742. ** On success, this routine obtains a read lock on
  1743. ** WAL_READ_LOCK(pWal.readLock). The pWal.readLock integer is
  1744. ** in the range 0 <= pWal.readLock < WAL_NREADER. If pWal.readLock==(-1)
  1745. ** that means the Wal does not hold any read lock. The reader must not
  1746. ** access any database page that is modified by a WAL frame up to and
  1747. ** including frame number aReadMark[pWal.readLock]. The reader will
  1748. ** use WAL frames up to and including pWal.hdr.mxFrame if pWal.readLock>0
  1749. ** Or if pWal.readLock==0, then the reader will ignore the WAL
  1750. ** completely and get all content directly from the database file.
  1751. ** If the useWal parameter is 1 then the WAL will never be ignored and
  1752. ** this routine will always set pWal.readLock>0 on success.
  1753. ** When the read transaction is completed, the caller must release the
  1754. ** lock on WAL_READ_LOCK(pWal.readLock) and set pWal.readLock to -1.
  1755. **
  1756. ** This routine uses the nBackfill and aReadMark[] fields of the header
  1757. ** to select a particular WAL_READ_LOCK() that strives to let the
  1758. ** checkpoint process do as much work as possible. This routine might
  1759. ** update values of the aReadMark[] array in the header, but if it does
  1760. ** so it takes care to hold an exclusive lock on the corresponding
  1761. ** WAL_READ_LOCK() while changing values.
  1762. */
  1763. static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){
  1764. volatile WalCkptInfo *pInfo; /* Checkpoint information in wal-index */
  1765. u32 mxReadMark; /* Largest aReadMark[] value */
  1766. int mxI; /* Index of largest aReadMark[] value */
  1767. int i; /* Loop counter */
  1768. int rc = SQLITE_OK; /* Return code */
  1769. Debug.Assert( pWal.readLock<0 ); /* Not currently locked */
  1770. /* Take steps to avoid spinning forever if there is a protocol error. */
  1771. if( cnt>5 ){
  1772. if( cnt>100 ) return SQLITE_PROTOCOL;
  1773. sqlite3OsSleep(pWal.pVfs, 1);
  1774. }
  1775. if( !useWal ){
  1776. rc = walIndexReadHdr(pWal, pChanged);
  1777. if( rc==SQLITE_BUSY ){
  1778. /* If there is not a recovery running in another thread or process
  1779. ** then convert BUSY errors to WAL_RETRY. If recovery is known to
  1780. ** be running, convert BUSY to BUSY_RECOVERY. There is a race here
  1781. ** which might cause WAL_RETRY to be returned even if BUSY_RECOVERY
  1782. ** would be technically correct. But the race is benign since with
  1783. ** WAL_RETRY this routine will be called again and will probably be
  1784. ** right on the second iteration.
  1785. */
  1786. if( pWal.apWiData[0]==0 ){
  1787. /* This branch is taken when the xShmMap() method returns SQLITE_BUSY.
  1788. ** We assume this is a transient condition, so return WAL_RETRY. The
  1789. ** xShmMap() implementation used by the default unix and win32 VFS
  1790. ** modules may return SQLITE_BUSY due to a race condition in the
  1791. ** code that determines whether or not the shared-memory region
  1792. ** must be zeroed before the requested page is returned.
  1793. */
  1794. rc = WAL_RETRY;
  1795. }else if( SQLITE_OK==(rc = walLockShared(pWal, WAL_RECOVER_LOCK)) ){
  1796. walUnlockShared(pWal, WAL_RECOVER_LOCK);
  1797. rc = WAL_RETRY;
  1798. }else if( rc==SQLITE_BUSY ){
  1799. rc = SQLITE_BUSY_RECOVERY;
  1800. }
  1801. }
  1802. if( rc!=SQLITE_OK ){
  1803. return rc;
  1804. }
  1805. }
  1806. pInfo = walCkptInfo(pWal);
  1807. if( !useWal && pInfo.nBackfill==pWal.hdr.mxFrame ){
  1808. /* The WAL has been completely backfilled (or it is empty).
  1809. ** and can be safely ignored.
  1810. */
  1811. rc = walLockShared(pWal, WAL_READ_LOCK(0));
  1812. walShmBarrier(pWal);
  1813. if( rc==SQLITE_OK ){
  1814. if( memcmp((void *)walIndexHdr(pWal), &pWal.hdr, sizeof(WalIndexHdr)) ){
  1815. /* It is not safe to allow the reader to continue here if frames
  1816. ** may have been appended to the log before READ_LOCK(0) was obtained.
  1817. ** When holding READ_LOCK(0), the reader ignores the entire log file,
  1818. ** which implies that the database file contains a trustworthy
  1819. ** snapshoT. Since holding READ_LOCK(0) prevents a checkpoint from
  1820. ** happening, this is usually correct.
  1821. **
  1822. ** However, if frames have been appended to the log (or if the log
  1823. ** is wrapped and written for that matter) before the READ_LOCK(0)
  1824. ** is obtained, that is not necessarily true. A checkpointer may
  1825. ** have started to backfill the appended frames but crashed before
  1826. ** it finished. Leaving a corrupt image in the database file.
  1827. */
  1828. walUnlockShared(pWal, WAL_READ_LOCK(0));
  1829. return WAL_RETRY;
  1830. }
  1831. pWal.readLock = 0;
  1832. return SQLITE_OK;
  1833. }else if( rc!=SQLITE_BUSY ){
  1834. return rc;
  1835. }
  1836. }
  1837. /* If we get this far, it means that the reader will want to use
  1838. ** the WAL to get at content from recent commits. The job now is
  1839. ** to select one of the aReadMark[] entries that is closest to
  1840. ** but not exceeding pWal.hdr.mxFrame and lock that entry.
  1841. */
  1842. mxReadMark = 0;
  1843. mxI = 0;
  1844. for(i=1; i<WAL_NREADER; i++){
  1845. u32 thisMark = pInfo.aReadMark[i];
  1846. if( mxReadMark<=thisMark && thisMark<=pWal.hdr.mxFrame ){
  1847. Debug.Assert( thisMark!=READMARK_NOT_USED );
  1848. mxReadMark = thisMark;
  1849. mxI = i;
  1850. }
  1851. }
  1852. if( mxI==0 ){
  1853. /* If we get here, it means that all of the aReadMark[] entries between
  1854. ** 1 and WAL_NREADER-1 are zero. Try to initialize aReadMark[1] to
  1855. ** be mxFrame, then retry.
  1856. */
  1857. rc = walLockExclusive(pWal, WAL_READ_LOCK(1), 1);
  1858. if( rc==SQLITE_OK ){
  1859. pInfo.aReadMark[1] = pWal.hdr.mxFrame;
  1860. walUnlockExclusive(pWal, WAL_READ_LOCK(1), 1);
  1861. rc = WAL_RETRY;
  1862. }else if( rc==SQLITE_BUSY ){
  1863. rc = WAL_RETRY;
  1864. }
  1865. return rc;
  1866. }else{
  1867. if( mxReadMark < pWal.hdr.mxFrame ){
  1868. for(i=1; i<WAL_NREADER; i++){
  1869. rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
  1870. if( rc==SQLITE_OK ){
  1871. mxReadMark = pInfo.aReadMark[i] = pWal.hdr.mxFrame;
  1872. mxI = i;
  1873. walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
  1874. break;
  1875. }else if( rc!=SQLITE_BUSY ){
  1876. return rc;
  1877. }
  1878. }
  1879. }
  1880. rc = walLockShared(pWal, WAL_READ_LOCK(mxI));
  1881. if( rc ){
  1882. return rc==SQLITE_BUSY ? WAL_RETRY : rc;
  1883. }
  1884. /* Now that the read-lock has been obtained, check that neither the
  1885. ** value in the aReadMark[] array or the contents of the wal-index
  1886. ** header have changed.
  1887. **
  1888. ** It is necessary to check that the wal-index header did not change
  1889. ** between the time it was read and when the shared-lock was obtained
  1890. ** on WAL_READ_LOCK(mxI) was obtained to account for the possibility
  1891. ** that the log file may have been wrapped by a writer, or that frames
  1892. ** that occur later in the log than pWal.hdr.mxFrame may have been
  1893. ** copied into the database by a checkpointer. If either of these things
  1894. ** happened, then reading the database with the current value of
  1895. ** pWal.hdr.mxFrame risks reading a corrupted snapshot. So, retry
  1896. ** instead.
  1897. **
  1898. ** This does not guarantee that the copy of the wal-index header is up to
  1899. ** date before proceeding. That would not be possible without somehow
  1900. ** blocking writers. It only guarantees that a dangerous checkpoint or
  1901. ** log-wrap (either of which would require an exclusive lock on
  1902. ** WAL_READ_LOCK(mxI)) has not occurred since the snapshot was valid.
  1903. */
  1904. walShmBarrier(pWal);
  1905. if( pInfo.aReadMark[mxI]!=mxReadMark
  1906. || memcmp((void *)walIndexHdr(pWal), &pWal.hdr, sizeof(WalIndexHdr))
  1907. ){
  1908. walUnlockShared(pWal, WAL_READ_LOCK(mxI));
  1909. return WAL_RETRY;
  1910. }else{
  1911. Debug.Assert( mxReadMark<=pWal.hdr.mxFrame );
  1912. pWal.readLock = (i16)mxI;
  1913. }
  1914. }
  1915. return rc;
  1916. }
  1917. /*
  1918. ** Begin a read transaction on the database.
  1919. **
  1920. ** This routine used to be called sqlite3OpenSnapshot() and with good reason:
  1921. ** it takes a snapshot of the state of the WAL and wal-index for the current
  1922. ** instant in time. The current thread will continue to use this snapshot.
  1923. ** Other threads might append new content to the WAL and wal-index but
  1924. ** that extra content is ignored by the current thread.
  1925. **
  1926. ** If the database contents have changed since the previous read
  1927. ** transaction, then *pChanged is set to 1 before returning. The
  1928. ** Pager layer will use this to know that its cache is stale and
  1929. ** needs to be flushed.
  1930. */
  1931. int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){
  1932. int rc; /* Return code */
  1933. int cnt = 0; /* Number of TryBeginRead attempts */
  1934. do{
  1935. rc = walTryBeginRead(pWal, pChanged, 0, ++cnt);
  1936. }while( rc==WAL_RETRY );
  1937. return rc;
  1938. }
  1939. /*
  1940. ** Finish with a read transaction. All this does is release the
  1941. ** read-lock.
  1942. */
  1943. void sqlite3WalEndReadTransaction(Wal *pWal){
  1944. if( pWal.readLock>=0 ){
  1945. walUnlockShared(pWal, WAL_READ_LOCK(pWal.readLock));
  1946. pWal.readLock = -1;
  1947. }
  1948. }
  1949. /*
  1950. ** Read a page from the WAL, if it is present in the WAL and if the
  1951. ** current read transaction is configured to use the WAL.
  1952. **
  1953. ** The *pInWal is set to 1 if the requested page is in the WAL and
  1954. ** has been loaded. Or *pInWal is set to 0 if the page was not in
  1955. ** the WAL and needs to be read out of the database.
  1956. */
  1957. int sqlite3WalRead(
  1958. Wal *pWal, /* WAL handle */
  1959. Pgno pgno, /* Database page number to read data for */
  1960. int *pInWal, /* OUT: True if data is read from WAL */
  1961. int nOut, /* Size of buffer pOut in bytes */
  1962. u8 *pOut /* Buffer to write page data to */
  1963. ){
  1964. u32 iRead = 0; /* If !=0, WAL frame to return data from */
  1965. u32 iLast = pWal.hdr.mxFrame; /* Last page in WAL for this reader */
  1966. int iHash; /* Used to loop through N hash tables */
  1967. /* This routine is only be called from within a read transaction. */
  1968. Debug.Assert( pWal.readLock>=0 || pWal.lockError );
  1969. /* If the "last page" field of the wal-index header snapshot is 0, then
  1970. ** no data will be read from the wal under any circumstances. Return early
  1971. ** in this case as an optimization. Likewise, if pWal.readLock==0,
  1972. ** then the WAL is ignored by the reader so return early, as if the
  1973. ** WAL were empty.
  1974. */
  1975. if( iLast==0 || pWal.readLock==0 ){
  1976. *pInWal = 0;
  1977. return SQLITE_OK;
  1978. }
  1979. /* Search the hash table or tables for an entry matching page number
  1980. ** pgno. Each iteration of the following for() loop searches one
  1981. ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames).
  1982. **
  1983. ** This code might run concurrently to the code in walIndexAppend()
  1984. ** that adds entries to the wal-index (and possibly to this hash
  1985. ** table). This means the value just read from the hash
  1986. ** slot (aHash[iKey]) may have been added before or after the
  1987. ** current read transaction was opened. Values added after the
  1988. ** read transaction was opened may have been written incorrectly -
  1989. ** i.e. these slots may contain garbage data. However, we assume
  1990. ** that any slots written before the current read transaction was
  1991. ** opened remain unmodified.
  1992. **
  1993. ** For the reasons above, the if(...) condition featured in the inner
  1994. ** loop of the following block is more stringent that would be required
  1995. ** if we had exclusive access to the hash-table:
  1996. **
  1997. ** (aPgno[iFrame]==pgno):
  1998. ** This condition filters out normal hash-table collisions.
  1999. **
  2000. ** (iFrame<=iLast):
  2001. ** This condition filters out entries that were added to the hash
  2002. ** table after the current read-transaction had started.
  2003. */
  2004. for(iHash=walFramePage(iLast); iHash>=0 && iRead==0; iHash--){
  2005. volatile ht_slot *aHash; /* Pointer to hash table */
  2006. volatile u32 *aPgno; /* Pointer to array of page numbers */
  2007. u32 iZero; /* Frame number corresponding to aPgno[0] */
  2008. int iKey; /* Hash slot index */
  2009. int nCollide; /* Number of hash collisions remaining */
  2010. int rc; /* Error code */
  2011. rc = walHashGet(pWal, iHash, &aHash, &aPgno, &iZero);
  2012. if( rc!=SQLITE_OK ){
  2013. return rc;
  2014. }
  2015. nCollide = HASHTABLE_NSLOT;
  2016. for(iKey=walHash(pgno); aHash[iKey]; iKey=walNextHash(iKey)){
  2017. u32 iFrame = aHash[iKey] + iZero;
  2018. if( iFrame<=iLast && aPgno[aHash[iKey]]==pgno ){
  2019. Debug.Assert( iFrame>iRead );
  2020. iRead = iFrame;
  2021. }
  2022. if( (nCollide--)==0 ){
  2023. return SQLITE_CORRUPT_BKPT;
  2024. }
  2025. }
  2026. }
  2027. #if SQLITE_ENABLE_EXPENSIVE_ASSERT
  2028. /* If expensive Debug.Assert() statements are available, do a linear search
  2029. ** of the wal-index file content. Make sure the results agree with the
  2030. ** result obtained using the hash indexes above. */
  2031. {
  2032. u32 iRead2 = 0;
  2033. u32 iTest;
  2034. for(iTest=iLast; iTest>0; iTest--){
  2035. if( walFramePgno(pWal, iTest)==pgno ){
  2036. iRead2 = iTest;
  2037. break;
  2038. }
  2039. }
  2040. Debug.Assert( iRead==iRead2 );
  2041. }
  2042. #endif
  2043. /* If iRead is non-zero, then it is the log frame number that contains the
  2044. ** required page. Read and return data from the log file.
  2045. */
  2046. if( iRead ){
  2047. i64 iOffset = walFrameOffset(iRead, pWal.hdr.szPage) + WAL_FRAME_HDRSIZE;
  2048. *pInWal = 1;
  2049. /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
  2050. return sqlite3OsRead(pWal.pWalFd, pOut, nOut, iOffset);
  2051. }
  2052. *pInWal = 0;
  2053. return SQLITE_OK;
  2054. }
  2055. /*
  2056. ** Set *pPgno to the size of the database file (or zero, if unknown).
  2057. */
  2058. void sqlite3WalDbsize(Wal *pWal, Pgno *pPgno){
  2059. Debug.Assert( pWal.readLock>=0 || pWal.lockError );
  2060. *pPgno = pWal.hdr.nPage;
  2061. }
  2062. /*
  2063. ** This function starts a write transaction on the WAL.
  2064. **
  2065. ** A read transaction must have already been started by a prior call
  2066. ** to sqlite3WalBeginReadTransaction().
  2067. **
  2068. ** If another thread or process has written into the database since
  2069. ** the read transaction was started, then it is not possible for this
  2070. ** thread to write as doing so would cause a fork. So this routine
  2071. ** returns SQLITE_BUSY in that case and no write transaction is started.
  2072. **
  2073. ** There can only be a single writer active at a time.
  2074. */
  2075. int sqlite3WalBeginWriteTransaction(Wal *pWal){
  2076. int rc;
  2077. /* Cannot start a write transaction without first holding a read
  2078. ** transaction. */
  2079. Debug.Assert( pWal.readLock>=0 );
  2080. if( pWal.readOnly ){
  2081. return SQLITE_READONLY;
  2082. }
  2083. /* Only one writer allowed at a time. Get the write lock. Return
  2084. ** SQLITE_BUSY if unable.
  2085. */
  2086. rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1);
  2087. if( rc ){
  2088. return rc;
  2089. }
  2090. pWal.writeLock = 1;
  2091. /* If another connection has written to the database file since the
  2092. ** time the read transaction on this connection was started, then
  2093. ** the write is disallowed.
  2094. */
  2095. if( memcmp(&pWal.hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr))!=0 ){
  2096. walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
  2097. pWal.writeLock = 0;
  2098. rc = SQLITE_BUSY;
  2099. }
  2100. return rc;
  2101. }
  2102. /*
  2103. ** End a write transaction. The commit has already been done. This
  2104. ** routine merely releases the lock.
  2105. */
  2106. int sqlite3WalEndWriteTransaction(Wal *pWal){
  2107. if( pWal.writeLock ){
  2108. walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
  2109. pWal.writeLock = 0;
  2110. }
  2111. return SQLITE_OK;
  2112. }
  2113. /*
  2114. ** If any data has been written (but not committed) to the log file, this
  2115. ** function moves the write-pointer back to the start of the transaction.
  2116. **
  2117. ** Additionally, the callback function is invoked for each frame written
  2118. ** to the WAL since the start of the transaction. If the callback returns
  2119. ** other than SQLITE_OK, it is not invoked again and the error code is
  2120. ** returned to the caller.
  2121. **
  2122. ** Otherwise, if the callback function does not return an error, this
  2123. ** function returns SQLITE_OK.
  2124. */
  int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){
    int rc = SQLITE_OK;             /* Result of the most recent xUndo() call */

    if( pWal.writeLock ){
      /* Remember how far this transaction had written before the cached
      ** header is overwritten below. */
      Pgno iLastWritten = pWal.hdr.mxFrame;
      Pgno iUndo;

      /* Reload the client's cached wal-index header from the live copy,
      ** putting it back in the state it had before this client started
      ** writing. */
      memcpy(&pWal.hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr));

      /* Visit every frame between the restored mxFrame and the old write
      ** position, passing its page number to the callback. */
      iUndo = pWal.hdr.mxFrame + 1;
      while( ALWAYS(rc==SQLITE_OK) && iUndo<=iLastWritten ){
        /* xUndo is not expected to fail.  It does real work only for a page
        ** that is both in the cache and still referenced; if the page is
        ** not cached it is a no-op, and if it is cached but unreferenced
        ** the page is simply expelled.
        **
        ** During a rollback by the upper layer, page 1 is the only page
        ** that may still be referenced, and page 1 is not written to the
        ** log before commit - which is what the assert below checks.
        */
        Debug.Assert( walFramePgno(pWal, iUndo)!=1 );
        rc = xUndo(pUndoCtx, walFramePgno(pWal, iUndo));
        iUndo++;
      }
      walCleanupHash(pWal);
    }

    Debug.Assert( rc==SQLITE_OK );
    return rc;
  }
  2157. /*
  2158. ** Argument aWalData must point to an array of WAL_SAVEPOINT_NDATA u32
  2159. ** values. This function populates the array with values required to
  2160. ** "rollback" the write position of the WAL handle back to the current
  2161. ** point in the event of a savepoint rollback (via WalSavepointUndo()).
  2162. */
  void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData){
    /* Savepoints are only recorded while the write lock is held. */
    Debug.Assert( pWal.writeLock );

    /* Slot 3: the handle's nCkpt counter at the time of the savepoint. */
    aWalData[3] = pWal.nCkpt;

    /* Slots 1 and 2: the two words of the cached running frame checksum. */
    aWalData[2] = pWal.hdr.aFrameCksum[1];
    aWalData[1] = pWal.hdr.aFrameCksum[0];

    /* Slot 0: the current write position (last valid frame). */
    aWalData[0] = pWal.hdr.mxFrame;
  }
  2170. /*
  2171. ** Move the write position of the WAL back to the point identified by
  2172. ** the values in the aWalData[] array. aWalData must point to an array
  2173. ** of WAL_SAVEPOINT_NDATA u32 values that has been previously populated
  2174. ** by a call to WalSavepoint().
  2175. */
  int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){
    Debug.Assert( pWal.writeLock );
    Debug.Assert( aWalData[3]!=pWal.nCkpt || aWalData[0]<=pWal.hdr.mxFrame );

    if( aWalData[3]!=pWal.nCkpt ){
      /* nCkpt has moved on since the savepoint was taken: the savepoint
      ** was opened right after the write transaction began, and the writer
      ** then wrapped around to the start of the log.  Rewrite the saved
      ** values so that they refer to the restarted log.
      */
      aWalData[0] = 0;
      aWalData[3] = pWal.nCkpt;
    }

    /* If nothing has been written past the savepoint, there is nothing
    ** to rewind. */
    if( aWalData[0]>=pWal.hdr.mxFrame ) return SQLITE_OK;

    /* Wind the cached header back to the saved write position and running
    ** checksum, then tidy the hash via walCleanupHash(). */
    pWal.hdr.mxFrame = aWalData[0];
    pWal.hdr.aFrameCksum[0] = aWalData[1];
    pWal.hdr.aFrameCksum[1] = aWalData[2];
    walCleanupHash(pWal);
    return SQLITE_OK;
  }
  2196. /*
  2197. ** This function is called just before writing a set of frames to the log
  2198. ** file (see sqlite3WalFrames()). It checks to see if, instead of appending
  2199. ** to the current log file, it is possible to overwrite the start of the
  2200. ** existing log file with the new frames (i.e. "reset" the log). If so,
  2201. ** it sets pWal.hdr.mxFrame to 0. Otherwise, pWal.hdr.mxFrame is left
  2202. ** unchanged.
  2203. **
  2204. ** SQLITE_OK is returned if no error is encountered (regardless of whether
  2205. ** or not pWal.hdr.mxFrame is modified). An SQLite error code is returned
  2206. ** if an error occurs.
  2207. */
  static int walRestartLog(Wal *pWal){
    int rc = SQLITE_OK;             /* Value returned to the caller */
    int nAttempt = 0;               /* Number of walTryBeginRead() attempts */
    volatile WalCkptInfo *pInfo;    /* Checkpoint info from walCkptInfo() */

    /* Only a connection holding read-lock 0 is a candidate for a restart.
    ** Everything else leaves the log untouched and reports success. */
    if( pWal.readLock!=0 ) return rc;

    pInfo = walCkptInfo(pWal);
    Debug.Assert( pInfo.nBackfill==pWal.hdr.mxFrame );
    if( pInfo.nBackfill>0 ){
      rc = walLockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
      if( rc!=SQLITE_OK ){
        /* SQLITE_BUSY just means the log cannot be restarted right now;
        ** carry on and re-acquire a read lock below.  Any other error is
        ** returned as-is. */
        if( rc!=SQLITE_BUSY ) return rc;
      }else{
        /* Read locks 1..WAL_NREADER-1 were all obtained exclusively, so
        ** every reader is on WAL_READ_LOCK(0) - that is, no reader is using
        ** the WAL.  The frames of this transaction will therefore overwrite
        ** the start of the existing log, and the wal-index header is
        ** updated to say so.
        **
        ** Updating only the cached header would in theory be enough here.
        ** Writing the real wal-index header too is equally safe and spares
        ** sqlite3WalUndo() a special case should this transaction be
        ** rolled back.
        */
        u32 *aSalt = pWal.hdr.aSalt;  /* Big-endian salt values */
        int iReader;                  /* Index into aReadMark[] */

        pWal.nCkpt++;
        pWal.hdr.mxFrame = 0;
        /* Bump the first salt word by one; refill the second with random
        ** bytes. */
        sqlite3Put4byte((u8*)&aSalt[0], 1 + sqlite3Get4byte((u8*)&aSalt[0]));
        sqlite3_randomness(4, &aSalt[1]);
        walIndexWriteHdr(pWal);
        pInfo.nBackfill = 0;
        iReader = 1;
        while( iReader<WAL_NREADER ){
          pInfo.aReadMark[iReader] = READMARK_NOT_USED;
          iReader++;
        }
        Debug.Assert( pInfo.aReadMark[0]==0 );
        walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
      }
    }

    /* Drop read-lock 0 and obtain a read lock afresh, retrying for as long
    ** as walTryBeginRead() answers WAL_RETRY.  The attempt counter passed
    ** in starts at 1. */
    walUnlockShared(pWal, WAL_READ_LOCK(0));
    pWal.readLock = -1;
    do{
      int notUsed;
      nAttempt++;
      rc = walTryBeginRead(pWal, &notUsed, 1, nAttempt);
    }while( rc==WAL_RETRY );

    return rc;
  }
  2252. /*
  2253. ** Write a set of frames to the log. The caller must hold the write-lock
  2254. ** on the log file (obtained using sqlite3WalBeginWriteTransaction()).
  2255. */
  int sqlite3WalFrames(
    Wal *pWal,                      /* Wal handle to write to */
    int szPage,                     /* Database page-size in bytes */
    PgHdr *pList,                   /* List of dirty pages to write */
    Pgno nTruncate,                 /* Database size after this commit */
    int isCommit,                   /* True if this is a commit */
    int sync_flags                  /* Flags to pass to OsSync() (or 0) */
  ){
    int rc;                             /* Result of the most recent call */
    u32 iWrite;                         /* Last frame number written to the file */
    u32 iIndexed;                       /* Last frame number added to the wal-index */
    u8 aFrameHdr[WAL_FRAME_HDRSIZE];    /* Scratch buffer for one frame header */
    PgHdr *pPg;                         /* Cursor over pList */
    PgHdr *pTail = 0;                   /* Final page of pList, once written */
    int nPad = 0;                       /* Extra copies of pTail's frame written */

    Debug.Assert( pList );
    Debug.Assert( pWal.writeLock );

  #if (SQLITE_TEST) && (SQLITE_DEBUG)
    {
      int nDirty = 0;
      for(pPg=pList; pPg; pPg=pPg.pDirty){ nDirty++; }
      WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n",
                pWal, nDirty, pWal.hdr.mxFrame, isCommit ? "Commit" : "Spill"));
    }
  #endif

    /* Give walRestartLog() the chance to rewind the log so that these
    ** frames land at the start of the file rather than after
    ** pWal.hdr.mxFrame.
    */
    rc = walRestartLog(pWal);
    if( rc!=SQLITE_OK ) return rc;

    /* An mxFrame of zero means the next frame is the first one in the log,
    ** so the WAL header has to be written to offset 0 of the file first.
    ** The header format is described in the comments at the top of this
    ** source file.
    */
    iWrite = pWal.hdr.mxFrame;
    if( iWrite==0 ){
      u8 aHdr[WAL_HDRSIZE];         /* Buffer in which the header is built */
      u32 aHdrCksum[2];             /* Checksum over the header */

      sqlite3Put4byte(&aHdr[0], (WAL_MAGIC | SQLITE_BIGENDIAN));
      sqlite3Put4byte(&aHdr[4], WAL_MAX_VERSION);
      sqlite3Put4byte(&aHdr[8], szPage);
      sqlite3Put4byte(&aHdr[12], pWal.nCkpt);
      sqlite3_randomness(8, pWal.hdr.aSalt);
      memcpy(&aHdr[16], pWal.hdr.aSalt, 8);
      /* The checksum covers everything except its own two trailing words. */
      walChecksumBytes(1, aHdr, WAL_HDRSIZE-2*4, 0, aHdrCksum);
      sqlite3Put4byte(&aHdr[24], aHdrCksum[0]);
      sqlite3Put4byte(&aHdr[28], aHdrCksum[1]);

      /* The header checksum becomes the starting value of the cached
      ** running frame checksum. */
      pWal.szPage = (u16)szPage;
      pWal.hdr.bigEndCksum = SQLITE_BIGENDIAN;
      pWal.hdr.aFrameCksum[0] = aHdrCksum[0];
      pWal.hdr.aFrameCksum[1] = aHdrCksum[1];

      rc = sqlite3OsWrite(pWal.pWalFd, aHdr, sizeof(aHdr), 0);
      WALTRACE(("WAL%p: wal-header write %s\n", pWal, rc ? "failed" : "ok"));
      if( rc!=SQLITE_OK ) return rc;
    }
    Debug.Assert( (int)pWal.szPage==szPage );

    /* Write one frame (header followed by page image) per dirty page. */
    pPg = pList;
    while( pPg ){
      u32 nDbsize = 0;              /* Db-size field for the frame header */
      i64 iOffset;                  /* Where in the log this frame goes */
      void *pData;                  /* Page content to write */

      iOffset = walFrameOffset(++iWrite, szPage);
      /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */

      /* Only the last frame of a commit carries the database size; every
      ** other frame header stores zero there. */
      if( isCommit && pPg.pDirty==0 ) nDbsize = nTruncate;

  #if (SQLITE_HAS_CODEC)
      pData = sqlite3PagerCodec(pPg);
      if( pData==0 ) return SQLITE_NOMEM;
  #else
      pData = pPg.pData;
  #endif

      /* Frame header first ... */
      walEncodeFrame(pWal, pPg.pgno, nDbsize, pData, aFrameHdr);
      rc = sqlite3OsWrite(pWal.pWalFd, aFrameHdr, sizeof(aFrameHdr), iOffset);
      if( rc!=SQLITE_OK ) return rc;

      /* ... then the page image directly after it. */
      rc = sqlite3OsWrite(pWal.pWalFd, pData, szPage, iOffset+sizeof(aFrameHdr));
      if( rc!=SQLITE_OK ) return rc;

      pTail = pPg;
      pPg = pPg.pDirty;
    }

    /* When sync_flags is non-zero, round the end of the written data up to
    ** a multiple of the value reported by sqlite3OsSectorSize() by
    ** appending further copies of the last frame, then sync the log file.
    */
    if( sync_flags ){
      i64 szSector = sqlite3OsSectorSize(pWal.pWalFd);
      i64 iPad = walFrameOffset(iWrite+1, szPage);   /* Next write offset */
      i64 iPadEnd;                                   /* iPad rounded up */

      Debug.Assert( isCommit );
      Debug.Assert( szSector>0 );

      iPadEnd = ((iPad+szSector-1)/szSector) * szSector;
      while( iPad<iPadEnd ){
        void *pData;
  #if (SQLITE_HAS_CODEC)
        pData = sqlite3PagerCodec(pTail);
        if( pData==0 ) return SQLITE_NOMEM;
  #else
        pData = pTail.pData;
  #endif
        walEncodeFrame(pWal, pTail.pgno, nTruncate, pData, aFrameHdr);
        /* testcase( IS_BIG_INT(iPad) ); // requires a 4GiB WAL */
        rc = sqlite3OsWrite(pWal.pWalFd, aFrameHdr, sizeof(aFrameHdr), iPad);
        if( rc!=SQLITE_OK ) return rc;
        iPad += WAL_FRAME_HDRSIZE;
        rc = sqlite3OsWrite(pWal.pWalFd, pData, szPage, iPad);
        if( rc!=SQLITE_OK ) return rc;
        nPad++;
        iPad += szPage;
      }

      rc = sqlite3OsSync(pWal.pWalFd, sync_flags);
    }

    /* Record the new frames in the wal-index, restarting the frame count
    ** from the cached mxFrame.  No extra wal-index lock is taken for this:
    ** the write lock held on the wal-index means there is no other writer
    ** and that nothing existing readers may be using gets overwritten.
    ** Both loops stop at the first error, including a failed sync above.
    */
    iIndexed = pWal.hdr.mxFrame;
    for(pPg=pList; pPg && rc==SQLITE_OK; pPg=pPg.pDirty){
      rc = walIndexAppend(pWal, ++iIndexed, pPg.pgno);
    }
    /* One entry for each padding copy of the last page. */
    while( nPad>0 && rc==SQLITE_OK ){
      nPad--;
      rc = walIndexAppend(pWal, ++iIndexed, pTail.pgno);
    }

    if( rc==SQLITE_OK ){
      /* Bring the private copy of the header up to date. */
      pWal.hdr.szPage = (u16)szPage;
      pWal.hdr.mxFrame = iIndexed;
      if( isCommit ){
        pWal.hdr.iChange++;
        pWal.hdr.nPage = nTruncate;
        /* A commit also publishes the header to the wal-index and stores
        ** the frame count that sqlite3WalCallback() will hand out. */
        walIndexWriteHdr(pWal);
        pWal.iCallback = iIndexed;
      }
    }

    WALTRACE(("WAL%p: frame write %s\n", pWal, rc ? "failed" : "ok"));
    return rc;
  }
  2400. /*
  2401. ** This routine is called to implement sqlite3_wal_checkpoint() and
  2402. ** related interfaces.
  2403. **
  2404. ** Obtain a CHECKPOINT lock and then backfill as much information as
  2405. ** we can from WAL into the database.
  2406. */
  int sqlite3WalCheckpoint(
    Wal *pWal,                      /* Wal connection */
    int sync_flags,                 /* Flags to sync db file with (or 0) */
    int nBuf,                       /* Size of temporary buffer */
    u8 *zBuf                        /* Temporary buffer to use */
  ){
    int rc;                         /* Value returned to the caller */
    int hdrReloaded = 0;            /* Set by walIndexReadHdr() if a new header was loaded */

    Debug.Assert( pWal.ckptLock==0 );
    WALTRACE(("WAL%p: checkpoint begins\n", pWal));

    /* Take the checkpoint lock.  A failure here is usually SQLITE_BUSY
    ** (another thread or process is already checkpointing, or perhaps
    ** running recovery) but it might also be SQLITE_IOERR.  Either way it
    ** is returned unchanged. */
    rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1);
    if( rc!=SQLITE_OK ) return rc;
    pWal.ckptLock = 1;

    /* Refresh the wal-index header and, if that worked, copy content from
    ** the log into the database file. */
    rc = walIndexReadHdr(pWal, &hdrReloaded);
    if( rc==SQLITE_OK ) rc = walCheckpoint(pWal, sync_flags, nBuf, zBuf);

    if( hdrReloaded ){
      /* A new wal-index header was loaded before the checkpoint ran, so
      ** the pager cache belonging to pWal is stale.  Zeroing the cached
      ** header makes sure the pager notices, the next time it opens a
      ** snapshot on this database, that its cache must be reset.
      */
      memset(&pWal.hdr, 0, sizeof(WalIndexHdr));
    }

    /* Hand the checkpoint lock back whatever the outcome. */
    walUnlockExclusive(pWal, WAL_CKPT_LOCK, 1);
    pWal.ckptLock = 0;
    WALTRACE(("WAL%p: checkpoint %s\n", pWal, rc ? "failed" : "ok"));
    return rc;
  }
  2445. /* Return the value to pass to a sqlite3_wal_hook callback, the
  2446. ** number of frames in the WAL at the point of the last commit since
  2447. ** sqlite3WalCallback() was called. If no commits have occurred since
  2448. ** the last call, then return 0.
  2449. */
  int sqlite3WalCallback(Wal *pWal){
    u32 nFrame;                     /* Value read out of pWal.iCallback */

    /* A NULL handle yields zero. */
    if( pWal==0 ) return 0;

    /* Read-and-clear: the stored value is handed out once, after which
    ** further calls return zero until iCallback is set again. */
    nFrame = pWal.iCallback;
    pWal.iCallback = 0;
    return (int)nFrame;
  }
  2458. /*
  2459. ** This function is called to change the WAL subsystem into or out
  2460. ** of locking_mode=EXCLUSIVE.
  2461. **
  2462. ** If op is zero, then attempt to change from locking_mode=EXCLUSIVE
  2463. ** into locking_mode=NORMAL. This means that we must acquire a lock
  2464. ** on the pWal.readLock byte. If the WAL is already in locking_mode=NORMAL
  2465. ** or if the acquisition of the lock fails, then return 0. If the
  2466. ** transition out of exclusive-mode is successful, return 1. This
  2467. ** operation must occur while the pager is still holding the exclusive
  2468. ** lock on the main database file.
  2469. **
  2470. ** If op is one, then change from locking_mode=NORMAL into
  2471. ** locking_mode=EXCLUSIVE. This means that the pWal.readLock must
  2472. ** be released. Return 1 if the transition is made and 0 if the
  2473. ** WAL is already in exclusive-locking mode - meaning that this
  2474. ** routine is a no-op. The pager must already hold the exclusive lock
  2475. ** on the main database file before invoking this operation.
  2476. **
  2477. ** If op is negative, then do a dry-run of the op==1 case but do
  2478. ** not actually change anything. The pager uses this to see if it
  2479. ** should acquire the database exclusive lock prior to invoking
  2480. ** the op==1 case.
  2481. */
  int sqlite3WalExclusiveMode(Wal *pWal, int op){
    Debug.Assert( pWal.writeLock==0 );
    Debug.Assert( pWal.exclusiveMode!=WAL_HEAPMEMORY_MODE || op==-1 );

    /* pWal.readLock is normally set.  It may be -1 after an earlier failure
    ** to acquire a read-lock, but that cannot happen on a connection that
    ** really is in exclusive mode (no xShmLock locks are taken then), and
    ** the pager should not try to upgrade to exclusive mode after such an
    ** error.
    */
    Debug.Assert( pWal.readLock>=0 || pWal.lockError );
    Debug.Assert( pWal.readLock>=0 || (op<=0 && pWal.exclusiveMode==0) );

    /* op<0: dry run of the op==1 case.  Report whether the switch to
    ** exclusive mode would happen, without changing anything. */
    if( op<0 ) return pWal.exclusiveMode==0;

    /* op>0: NORMAL -> EXCLUSIVE.  Let go of the shared read lock and
    ** record the new mode. */
    if( op>0 ){
      Debug.Assert( pWal.exclusiveMode==0 );
      Debug.Assert( pWal.readLock>=0 );
      walUnlockShared(pWal, WAL_READ_LOCK(pWal.readLock));
      pWal.exclusiveMode = 1;
      return 1;
    }

    /* op==0: EXCLUSIVE -> NORMAL.  Nothing to do if already in NORMAL. */
    if( pWal.exclusiveMode==0 ) return 0;

    /* The mode flag is cleared before the lock call and put back if the
    ** shared read lock cannot be obtained. */
    pWal.exclusiveMode = 0;
    if( walLockShared(pWal, WAL_READ_LOCK(pWal.readLock))!=SQLITE_OK ){
      pWal.exclusiveMode = 1;
      return 0;
    }
    return 1;
  }
  2516. /*
  2517. ** Return true if the argument is non-NULL and the WAL module is using
  2518. ** heap-memory for the wal-index. Otherwise, if the argument is NULL or the
  2519. ** WAL module is using shared-memory, return false.
  2520. */
  int sqlite3WalHeapMemory(Wal *pWal){
    /* A NULL handle is never in heap-memory mode. */
    if( pWal==0 ) return 0;

    /* Otherwise the answer is whether the mode field holds the
    ** heap-memory marker. */
    return pWal.exclusiveMode==WAL_HEAPMEMORY_MODE;
  }
  2524. #endif //* #if !SQLITE_OMIT_WAL */
  2525. }
  2526. }