000001 /* 000002 ** 2004 April 6 000003 ** 000004 ** The author disclaims copyright to this source code. In place of 000005 ** a legal notice, here is a blessing: 000006 ** 000007 ** May you do good and not evil. 000008 ** May you find forgiveness for yourself and forgive others. 000009 ** May you share freely, never taking more than you give. 000010 ** 000011 ************************************************************************* 000012 ** This file implements an external (disk-based) database using BTrees. 000013 ** See the header comment on "btreeInt.h" for additional information. 000014 ** Including a description of file format and an overview of operation. 000015 */ 000016 #include "btreeInt.h" 000017 000018 /* 000019 ** The header string that appears at the beginning of every 000020 ** SQLite database. 000021 */ 000022 static const char zMagicHeader[] = SQLITE_FILE_HEADER; 000023 000024 /* 000025 ** Set this global variable to 1 to enable tracing using the TRACE 000026 ** macro. 000027 */ 000028 #if 0 000029 int sqlite3BtreeTrace=1; /* True to enable tracing */ 000030 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);} 000031 #else 000032 # define TRACE(X) 000033 #endif 000034 000035 /* 000036 ** Extract a 2-byte big-endian integer from an array of unsigned bytes. 000037 ** But if the value is zero, make it 65536. 000038 ** 000039 ** This routine is used to extract the "offset to cell content area" value 000040 ** from the header of a btree page. If the page size is 65536 and the page 000041 ** is empty, the offset should be 65536, but the 2-byte value stores zero. 000042 ** This routine makes the necessary adjustment to 65536. 
000043 */ 000044 #define get2byteNotZero(X) (((((int)get2byte(X))-1)&0xffff)+1) 000045 000046 /* 000047 ** Values passed as the 5th argument to allocateBtreePage() 000048 */ 000049 #define BTALLOC_ANY 0 /* Allocate any page */ 000050 #define BTALLOC_EXACT 1 /* Allocate exact page if possible */ 000051 #define BTALLOC_LE 2 /* Allocate any page <= the parameter */ 000052 000053 /* 000054 ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not 000055 ** defined, or 0 if it is. For example: 000056 ** 000057 ** bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum); 000058 */ 000059 #ifndef SQLITE_OMIT_AUTOVACUUM 000060 #define IfNotOmitAV(expr) (expr) 000061 #else 000062 #define IfNotOmitAV(expr) 0 000063 #endif 000064 000065 #ifndef SQLITE_OMIT_SHARED_CACHE 000066 /* 000067 ** A list of BtShared objects that are eligible for participation 000068 ** in shared cache. This variable has file scope during normal builds, 000069 ** but the test harness needs to access it so we make it global for 000070 ** test builds. 000071 ** 000072 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MAIN. 000073 */ 000074 #ifdef SQLITE_TEST 000075 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 000076 #else 000077 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 000078 #endif 000079 #endif /* SQLITE_OMIT_SHARED_CACHE */ 000080 000081 #ifndef SQLITE_OMIT_SHARED_CACHE 000082 /* 000083 ** Enable or disable the shared pager and schema features. 000084 ** 000085 ** This routine has no effect on existing database connections. 000086 ** The shared cache setting effects only future calls to 000087 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). 
000088 */ 000089 int sqlite3_enable_shared_cache(int enable){ 000090 sqlite3GlobalConfig.sharedCacheEnabled = enable; 000091 return SQLITE_OK; 000092 } 000093 #endif 000094 000095 000096 000097 #ifdef SQLITE_OMIT_SHARED_CACHE 000098 /* 000099 ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(), 000100 ** and clearAllSharedCacheTableLocks() 000101 ** manipulate entries in the BtShared.pLock linked list used to store 000102 ** shared-cache table level locks. If the library is compiled with the 000103 ** shared-cache feature disabled, then there is only ever one user 000104 ** of each BtShared structure and so this locking is not necessary. 000105 ** So define the lock related functions as no-ops. 000106 */ 000107 #define querySharedCacheTableLock(a,b,c) SQLITE_OK 000108 #define setSharedCacheTableLock(a,b,c) SQLITE_OK 000109 #define clearAllSharedCacheTableLocks(a) 000110 #define downgradeAllSharedCacheTableLocks(a) 000111 #define hasSharedCacheTableLock(a,b,c,d) 1 000112 #define hasReadConflicts(a, b) 0 000113 #endif 000114 000115 #ifdef SQLITE_DEBUG 000116 /* 000117 ** Return and reset the seek counter for a Btree object. 000118 */ 000119 sqlite3_uint64 sqlite3BtreeSeekCount(Btree *pBt){ 000120 u64 n = pBt->nSeek; 000121 pBt->nSeek = 0; 000122 return n; 000123 } 000124 #endif 000125 000126 /* 000127 ** Implementation of the SQLITE_CORRUPT_PAGE() macro. Takes a single 000128 ** (MemPage*) as an argument. The (MemPage*) must not be NULL. 000129 ** 000130 ** If SQLITE_DEBUG is not defined, then this macro is equivalent to 000131 ** SQLITE_CORRUPT_BKPT. Or, if SQLITE_DEBUG is set, then the log message 000132 ** normally produced as a side-effect of SQLITE_CORRUPT_BKPT is augmented 000133 ** with the page number and filename associated with the (MemPage*). 
000134 */ 000135 #ifdef SQLITE_DEBUG 000136 int corruptPageError(int lineno, MemPage *p){ 000137 char *zMsg; 000138 sqlite3BeginBenignMalloc(); 000139 zMsg = sqlite3_mprintf("database corruption page %u of %s", 000140 p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0) 000141 ); 000142 sqlite3EndBenignMalloc(); 000143 if( zMsg ){ 000144 sqlite3ReportError(SQLITE_CORRUPT, lineno, zMsg); 000145 } 000146 sqlite3_free(zMsg); 000147 return SQLITE_CORRUPT_BKPT; 000148 } 000149 # define SQLITE_CORRUPT_PAGE(pMemPage) corruptPageError(__LINE__, pMemPage) 000150 #else 000151 # define SQLITE_CORRUPT_PAGE(pMemPage) SQLITE_CORRUPT_PGNO(pMemPage->pgno) 000152 #endif 000153 000154 /* Default value for SHARED_LOCK_TRACE macro if shared-cache is disabled 000155 ** or if the lock tracking is disabled. This is always the value for 000156 ** release builds. 000157 */ 000158 #define SHARED_LOCK_TRACE(X,MSG,TAB,TYPE) /*no-op*/ 000159 000160 #ifndef SQLITE_OMIT_SHARED_CACHE 000161 000162 #if 0 000163 /* ^---- Change to 1 and recompile to enable shared-lock tracing 000164 ** for debugging purposes. 000165 ** 000166 ** Print all shared-cache locks on a BtShared. Debugging use only. 000167 */ 000168 static void sharedLockTrace( 000169 BtShared *pBt, 000170 const char *zMsg, 000171 int iRoot, 000172 int eLockType 000173 ){ 000174 BtLock *pLock; 000175 if( iRoot>0 ){ 000176 printf("%s-%p %u%s:", zMsg, pBt, iRoot, eLockType==READ_LOCK?"R":"W"); 000177 }else{ 000178 printf("%s-%p:", zMsg, pBt); 000179 } 000180 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){ 000181 printf(" %p/%u%s", pLock->pBtree, pLock->iTable, 000182 pLock->eLock==READ_LOCK ? "R" : "W"); 000183 while( pLock->pNext && pLock->pBtree==pLock->pNext->pBtree ){ 000184 pLock = pLock->pNext; 000185 printf(",%u%s", pLock->iTable, pLock->eLock==READ_LOCK ? 
"R" : "W"); 000186 } 000187 } 000188 printf("\n"); 000189 fflush(stdout); 000190 } 000191 #undef SHARED_LOCK_TRACE 000192 #define SHARED_LOCK_TRACE(X,MSG,TAB,TYPE) sharedLockTrace(X,MSG,TAB,TYPE) 000193 #endif /* Shared-lock tracing */ 000194 000195 #ifdef SQLITE_DEBUG 000196 /* 000197 **** This function is only used as part of an assert() statement. *** 000198 ** 000199 ** Check to see if pBtree holds the required locks to read or write to the 000200 ** table with root page iRoot. Return 1 if it does and 0 if not. 000201 ** 000202 ** For example, when writing to a table with root-page iRoot via 000203 ** Btree connection pBtree: 000204 ** 000205 ** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) ); 000206 ** 000207 ** When writing to an index that resides in a sharable database, the 000208 ** caller should have first obtained a lock specifying the root page of 000209 ** the corresponding table. This makes things a bit more complicated, 000210 ** as this module treats each table as a separate structure. To determine 000211 ** the table corresponding to the index being written, this 000212 ** function has to search through the database schema. 000213 ** 000214 ** Instead of a lock on the table/index rooted at page iRoot, the caller may 000215 ** hold a write-lock on the schema table (root page 1). This is also 000216 ** acceptable. 000217 */ 000218 static int hasSharedCacheTableLock( 000219 Btree *pBtree, /* Handle that must hold lock */ 000220 Pgno iRoot, /* Root page of b-tree */ 000221 int isIndex, /* True if iRoot is the root of an index b-tree */ 000222 int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */ 000223 ){ 000224 Schema *pSchema = (Schema *)pBtree->pBt->pSchema; 000225 Pgno iTab = 0; 000226 BtLock *pLock; 000227 000228 /* If this database is not shareable, or if the client is reading 000229 ** and has the read-uncommitted flag set, then no lock is required. 000230 ** Return true immediately. 
000231 */ 000232 if( (pBtree->sharable==0) 000233 || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommit)) 000234 ){ 000235 return 1; 000236 } 000237 000238 /* If the client is reading or writing an index and the schema is 000239 ** not loaded, then it is too difficult to actually check to see if 000240 ** the correct locks are held. So do not bother - just return true. 000241 ** This case does not come up very often anyhow. 000242 */ 000243 if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){ 000244 return 1; 000245 } 000246 000247 /* Figure out the root-page that the lock should be held on. For table 000248 ** b-trees, this is just the root page of the b-tree being read or 000249 ** written. For index b-trees, it is the root page of the associated 000250 ** table. */ 000251 if( isIndex ){ 000252 HashElem *p; 000253 int bSeen = 0; 000254 for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){ 000255 Index *pIdx = (Index *)sqliteHashData(p); 000256 if( pIdx->tnum==iRoot ){ 000257 if( bSeen ){ 000258 /* Two or more indexes share the same root page. There must 000259 ** be imposter tables. So just return true. The assert is not 000260 ** useful in that case. */ 000261 return 1; 000262 } 000263 iTab = pIdx->pTable->tnum; 000264 bSeen = 1; 000265 } 000266 } 000267 }else{ 000268 iTab = iRoot; 000269 } 000270 000271 SHARED_LOCK_TRACE(pBtree->pBt,"hasLock",iRoot,eLockType); 000272 000273 /* Search for the required lock. Either a write-lock on root-page iTab, a 000274 ** write-lock on the schema table, or (if the client is reading) a 000275 ** read-lock on iTab will suffice. Return 1 if any of these are found. */ 000276 for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){ 000277 if( pLock->pBtree==pBtree 000278 && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1)) 000279 && pLock->eLock>=eLockType 000280 ){ 000281 return 1; 000282 } 000283 } 000284 000285 /* Failed to find the required lock. 
*/ 000286 return 0; 000287 } 000288 #endif /* SQLITE_DEBUG */ 000289 000290 #ifdef SQLITE_DEBUG 000291 /* 000292 **** This function may be used as part of assert() statements only. **** 000293 ** 000294 ** Return true if it would be illegal for pBtree to write into the 000295 ** table or index rooted at iRoot because other shared connections are 000296 ** simultaneously reading that same table or index. 000297 ** 000298 ** It is illegal for pBtree to write if some other Btree object that 000299 ** shares the same BtShared object is currently reading or writing 000300 ** the iRoot table. Except, if the other Btree object has the 000301 ** read-uncommitted flag set, then it is OK for the other object to 000302 ** have a read cursor. 000303 ** 000304 ** For example, before writing to any part of the table or index 000305 ** rooted at page iRoot, one should call: 000306 ** 000307 ** assert( !hasReadConflicts(pBtree, iRoot) ); 000308 */ 000309 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){ 000310 BtCursor *p; 000311 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 000312 if( p->pgnoRoot==iRoot 000313 && p->pBtree!=pBtree 000314 && 0==(p->pBtree->db->flags & SQLITE_ReadUncommit) 000315 ){ 000316 return 1; 000317 } 000318 } 000319 return 0; 000320 } 000321 #endif /* #ifdef SQLITE_DEBUG */ 000322 000323 /* 000324 ** Query to see if Btree handle p may obtain a lock of type eLock 000325 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return 000326 ** SQLITE_OK if the lock may be obtained (by calling 000327 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not. 
000328 */ 000329 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ 000330 BtShared *pBt = p->pBt; 000331 BtLock *pIter; 000332 000333 assert( sqlite3BtreeHoldsMutex(p) ); 000334 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 000335 assert( p->db!=0 ); 000336 assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 ); 000337 000338 /* If requesting a write-lock, then the Btree must have an open write 000339 ** transaction on this file. And, obviously, for this to be so there 000340 ** must be an open write transaction on the file itself. 000341 */ 000342 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) ); 000343 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE ); 000344 000345 /* This routine is a no-op if the shared-cache is not enabled */ 000346 if( !p->sharable ){ 000347 return SQLITE_OK; 000348 } 000349 000350 /* If some other connection is holding an exclusive lock, the 000351 ** requested lock may not be obtained. 000352 */ 000353 if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){ 000354 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db); 000355 return SQLITE_LOCKED_SHAREDCACHE; 000356 } 000357 000358 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 000359 /* The condition (pIter->eLock!=eLock) in the following if(...) 000360 ** statement is a simplification of: 000361 ** 000362 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK) 000363 ** 000364 ** since we know that if eLock==WRITE_LOCK, then no other connection 000365 ** may hold a WRITE_LOCK on any table in this file (since there can 000366 ** only be a single writer). 
000367 */ 000368 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK ); 000369 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK); 000370 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){ 000371 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db); 000372 if( eLock==WRITE_LOCK ){ 000373 assert( p==pBt->pWriter ); 000374 pBt->btsFlags |= BTS_PENDING; 000375 } 000376 return SQLITE_LOCKED_SHAREDCACHE; 000377 } 000378 } 000379 return SQLITE_OK; 000380 } 000381 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 000382 000383 #ifndef SQLITE_OMIT_SHARED_CACHE 000384 /* 000385 ** Add a lock on the table with root-page iTable to the shared-btree used 000386 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 000387 ** WRITE_LOCK. 000388 ** 000389 ** This function assumes the following: 000390 ** 000391 ** (a) The specified Btree object p is connected to a sharable 000392 ** database (one with the BtShared.sharable flag set), and 000393 ** 000394 ** (b) No other Btree objects hold a lock that conflicts 000395 ** with the requested lock (i.e. querySharedCacheTableLock() has 000396 ** already been called and returned SQLITE_OK). 000397 ** 000398 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM 000399 ** is returned if a malloc attempt fails. 000400 */ 000401 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){ 000402 BtShared *pBt = p->pBt; 000403 BtLock *pLock = 0; 000404 BtLock *pIter; 000405 000406 SHARED_LOCK_TRACE(pBt,"setLock", iTable, eLock); 000407 000408 assert( sqlite3BtreeHoldsMutex(p) ); 000409 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 000410 assert( p->db!=0 ); 000411 000412 /* A connection with the read-uncommitted flag set will never try to 000413 ** obtain a read-lock using this function. The only read-lock obtained 000414 ** by a connection in read-uncommitted mode is on the sqlite_schema 000415 ** table, and that lock is obtained in BtreeBeginTrans(). 
*/ 000416 assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK ); 000417 000418 /* This function should only be called on a sharable b-tree after it 000419 ** has been determined that no other b-tree holds a conflicting lock. */ 000420 assert( p->sharable ); 000421 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) ); 000422 000423 /* First search the list for an existing lock on this table. */ 000424 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 000425 if( pIter->iTable==iTable && pIter->pBtree==p ){ 000426 pLock = pIter; 000427 break; 000428 } 000429 } 000430 000431 /* If the above search did not find a BtLock struct associating Btree p 000432 ** with table iTable, allocate one and link it into the list. 000433 */ 000434 if( !pLock ){ 000435 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock)); 000436 if( !pLock ){ 000437 return SQLITE_NOMEM_BKPT; 000438 } 000439 pLock->iTable = iTable; 000440 pLock->pBtree = p; 000441 pLock->pNext = pBt->pLock; 000442 pBt->pLock = pLock; 000443 } 000444 000445 /* Set the BtLock.eLock variable to the maximum of the current lock 000446 ** and the requested lock. This means if a write-lock was already held 000447 ** and a read-lock requested, we don't incorrectly downgrade the lock. 000448 */ 000449 assert( WRITE_LOCK>READ_LOCK ); 000450 if( eLock>pLock->eLock ){ 000451 pLock->eLock = eLock; 000452 } 000453 000454 return SQLITE_OK; 000455 } 000456 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 000457 000458 #ifndef SQLITE_OMIT_SHARED_CACHE 000459 /* 000460 ** Release all the table locks (locks obtained via calls to 000461 ** the setSharedCacheTableLock() procedure) held by Btree object p. 000462 ** 000463 ** This function assumes that Btree p has an open read or write 000464 ** transaction. If it does not, then the BTS_PENDING flag 000465 ** may be incorrectly cleared. 
000466 */ 000467 static void clearAllSharedCacheTableLocks(Btree *p){ 000468 BtShared *pBt = p->pBt; 000469 BtLock **ppIter = &pBt->pLock; 000470 000471 assert( sqlite3BtreeHoldsMutex(p) ); 000472 assert( p->sharable || 0==*ppIter ); 000473 assert( p->inTrans>0 ); 000474 000475 SHARED_LOCK_TRACE(pBt, "clearAllLocks", 0, 0); 000476 000477 while( *ppIter ){ 000478 BtLock *pLock = *ppIter; 000479 assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree ); 000480 assert( pLock->pBtree->inTrans>=pLock->eLock ); 000481 if( pLock->pBtree==p ){ 000482 *ppIter = pLock->pNext; 000483 assert( pLock->iTable!=1 || pLock==&p->lock ); 000484 if( pLock->iTable!=1 ){ 000485 sqlite3_free(pLock); 000486 } 000487 }else{ 000488 ppIter = &pLock->pNext; 000489 } 000490 } 000491 000492 assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter ); 000493 if( pBt->pWriter==p ){ 000494 pBt->pWriter = 0; 000495 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING); 000496 }else if( pBt->nTransaction==2 ){ 000497 /* This function is called when Btree p is concluding its 000498 ** transaction. If there currently exists a writer, and p is not 000499 ** that writer, then the number of locks held by connections other 000500 ** than the writer must be about to drop to zero. In this case 000501 ** set the BTS_PENDING flag to 0. 000502 ** 000503 ** If there is not currently a writer, then BTS_PENDING must 000504 ** be zero already. So this next line is harmless in that case. 000505 */ 000506 pBt->btsFlags &= ~BTS_PENDING; 000507 } 000508 } 000509 000510 /* 000511 ** This function changes all write-locks held by Btree p into read-locks. 
000512 */ 000513 static void downgradeAllSharedCacheTableLocks(Btree *p){ 000514 BtShared *pBt = p->pBt; 000515 000516 SHARED_LOCK_TRACE(pBt, "downgradeLocks", 0, 0); 000517 000518 if( pBt->pWriter==p ){ 000519 BtLock *pLock; 000520 pBt->pWriter = 0; 000521 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING); 000522 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){ 000523 assert( pLock->eLock==READ_LOCK || pLock->pBtree==p ); 000524 pLock->eLock = READ_LOCK; 000525 } 000526 } 000527 } 000528 000529 #endif /* SQLITE_OMIT_SHARED_CACHE */ 000530 000531 static void releasePage(MemPage *pPage); /* Forward reference */ 000532 static void releasePageOne(MemPage *pPage); /* Forward reference */ 000533 static void releasePageNotNull(MemPage *pPage); /* Forward reference */ 000534 000535 /* 000536 ***** This routine is used inside of assert() only **** 000537 ** 000538 ** Verify that the cursor holds the mutex on its BtShared 000539 */ 000540 #ifdef SQLITE_DEBUG 000541 static int cursorHoldsMutex(BtCursor *p){ 000542 return sqlite3_mutex_held(p->pBt->mutex); 000543 } 000544 000545 /* Verify that the cursor and the BtShared agree about what is the current 000546 ** database connetion. This is important in shared-cache mode. If the database 000547 ** connection pointers get out-of-sync, it is possible for routines like 000548 ** btreeInitPage() to reference an stale connection pointer that references a 000549 ** a connection that has already closed. This routine is used inside assert() 000550 ** statements only and for the purpose of double-checking that the btree code 000551 ** does keep the database connection pointers up-to-date. 000552 */ 000553 static int cursorOwnsBtShared(BtCursor *p){ 000554 assert( cursorHoldsMutex(p) ); 000555 return (p->pBtree->db==p->pBt->db); 000556 } 000557 #endif 000558 000559 /* 000560 ** Invalidate the overflow cache of the cursor passed as the first argument. 000561 ** on the shared btree structure pBt. 
000562 */ 000563 #define invalidateOverflowCache(pCur) (pCur->curFlags &= ~BTCF_ValidOvfl) 000564 000565 /* 000566 ** Invalidate the overflow page-list cache for all cursors opened 000567 ** on the shared btree structure pBt. 000568 */ 000569 static void invalidateAllOverflowCache(BtShared *pBt){ 000570 BtCursor *p; 000571 assert( sqlite3_mutex_held(pBt->mutex) ); 000572 for(p=pBt->pCursor; p; p=p->pNext){ 000573 invalidateOverflowCache(p); 000574 } 000575 } 000576 000577 #ifndef SQLITE_OMIT_INCRBLOB 000578 /* 000579 ** This function is called before modifying the contents of a table 000580 ** to invalidate any incrblob cursors that are open on the 000581 ** row or one of the rows being modified. 000582 ** 000583 ** If argument isClearTable is true, then the entire contents of the 000584 ** table is about to be deleted. In this case invalidate all incrblob 000585 ** cursors open on any row within the table with root-page pgnoRoot. 000586 ** 000587 ** Otherwise, if argument isClearTable is false, then the row with 000588 ** rowid iRow is being replaced or deleted. In this case invalidate 000589 ** only those incrblob cursors open on that specific row. 
000590 */ 000591 static void invalidateIncrblobCursors( 000592 Btree *pBtree, /* The database file to check */ 000593 Pgno pgnoRoot, /* The table that might be changing */ 000594 i64 iRow, /* The rowid that might be changing */ 000595 int isClearTable /* True if all rows are being deleted */ 000596 ){ 000597 BtCursor *p; 000598 assert( pBtree->hasIncrblobCur ); 000599 assert( sqlite3BtreeHoldsMutex(pBtree) ); 000600 pBtree->hasIncrblobCur = 0; 000601 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 000602 if( (p->curFlags & BTCF_Incrblob)!=0 ){ 000603 pBtree->hasIncrblobCur = 1; 000604 if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){ 000605 p->eState = CURSOR_INVALID; 000606 } 000607 } 000608 } 000609 } 000610 000611 #else 000612 /* Stub function when INCRBLOB is omitted */ 000613 #define invalidateIncrblobCursors(w,x,y,z) 000614 #endif /* SQLITE_OMIT_INCRBLOB */ 000615 000616 /* 000617 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called 000618 ** when a page that previously contained data becomes a free-list leaf 000619 ** page. 000620 ** 000621 ** The BtShared.pHasContent bitvec exists to work around an obscure 000622 ** bug caused by the interaction of two useful IO optimizations surrounding 000623 ** free-list leaf pages: 000624 ** 000625 ** 1) When all data is deleted from a page and the page becomes 000626 ** a free-list leaf page, the page is not written to the database 000627 ** (as free-list leaf pages contain no meaningful data). Sometimes 000628 ** such a page is not even journalled (as it will not be modified, 000629 ** why bother journalling it?). 000630 ** 000631 ** 2) When a free-list leaf page is reused, its content is not read 000632 ** from the database or written to the journal file (why should it 000633 ** be, if it is not at all meaningful?). 000634 ** 000635 ** By themselves, these optimizations work fine and provide a handy 000636 ** performance boost to bulk delete or insert operations. 
However, if 000637 ** a page is moved to the free-list and then reused within the same 000638 ** transaction, a problem comes up. If the page is not journalled when 000639 ** it is moved to the free-list and it is also not journalled when it 000640 ** is extracted from the free-list and reused, then the original data 000641 ** may be lost. In the event of a rollback, it may not be possible 000642 ** to restore the database to its original configuration. 000643 ** 000644 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is 000645 ** moved to become a free-list leaf page, the corresponding bit is 000646 ** set in the bitvec. Whenever a leaf page is extracted from the free-list, 000647 ** optimization 2 above is omitted if the corresponding bit is already 000648 ** set in BtShared.pHasContent. The contents of the bitvec are cleared 000649 ** at the end of every transaction. 000650 */ 000651 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){ 000652 int rc = SQLITE_OK; 000653 if( !pBt->pHasContent ){ 000654 assert( pgno<=pBt->nPage ); 000655 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage); 000656 if( !pBt->pHasContent ){ 000657 rc = SQLITE_NOMEM_BKPT; 000658 } 000659 } 000660 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){ 000661 rc = sqlite3BitvecSet(pBt->pHasContent, pgno); 000662 } 000663 return rc; 000664 } 000665 000666 /* 000667 ** Query the BtShared.pHasContent vector. 000668 ** 000669 ** This function is called when a free-list leaf page is removed from the 000670 ** free-list for reuse. It returns false if it is safe to retrieve the 000671 ** page from the pager layer with the 'no-content' flag set. True otherwise. 000672 */ 000673 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){ 000674 Bitvec *p = pBt->pHasContent; 000675 return p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTestNotNull(p, pgno)); 000676 } 000677 000678 /* 000679 ** Clear (destroy) the BtShared.pHasContent bitvec. 
This should be 000680 ** invoked at the conclusion of each write-transaction. 000681 */ 000682 static void btreeClearHasContent(BtShared *pBt){ 000683 sqlite3BitvecDestroy(pBt->pHasContent); 000684 pBt->pHasContent = 0; 000685 } 000686 000687 /* 000688 ** Release all of the apPage[] pages for a cursor. 000689 */ 000690 static void btreeReleaseAllCursorPages(BtCursor *pCur){ 000691 int i; 000692 if( pCur->iPage>=0 ){ 000693 for(i=0; i<pCur->iPage; i++){ 000694 releasePageNotNull(pCur->apPage[i]); 000695 } 000696 releasePageNotNull(pCur->pPage); 000697 pCur->iPage = -1; 000698 } 000699 } 000700 000701 /* 000702 ** The cursor passed as the only argument must point to a valid entry 000703 ** when this function is called (i.e. have eState==CURSOR_VALID). This 000704 ** function saves the current cursor key in variables pCur->nKey and 000705 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error 000706 ** code otherwise. 000707 ** 000708 ** If the cursor is open on an intkey table, then the integer key 000709 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to 000710 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is 000711 ** set to point to a malloced buffer pCur->nKey bytes in size containing 000712 ** the key. 000713 */ 000714 static int saveCursorKey(BtCursor *pCur){ 000715 int rc = SQLITE_OK; 000716 assert( CURSOR_VALID==pCur->eState ); 000717 assert( 0==pCur->pKey ); 000718 assert( cursorHoldsMutex(pCur) ); 000719 000720 if( pCur->curIntKey ){ 000721 /* Only the rowid is required for a table btree */ 000722 pCur->nKey = sqlite3BtreeIntegerKey(pCur); 000723 }else{ 000724 /* For an index btree, save the complete key content. It is possible 000725 ** that the current key is corrupt. In that case, it is possible that 000726 ** the sqlite3VdbeRecordUnpack() function may overread the buffer by 000727 ** up to the size of 1 varint plus 1 8-byte value when the cursor 000728 ** position is restored. 
Hence the 17 bytes of padding allocated 000729 ** below. */ 000730 void *pKey; 000731 pCur->nKey = sqlite3BtreePayloadSize(pCur); 000732 pKey = sqlite3Malloc( pCur->nKey + 9 + 8 ); 000733 if( pKey ){ 000734 rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey); 000735 if( rc==SQLITE_OK ){ 000736 memset(((u8*)pKey)+pCur->nKey, 0, 9+8); 000737 pCur->pKey = pKey; 000738 }else{ 000739 sqlite3_free(pKey); 000740 } 000741 }else{ 000742 rc = SQLITE_NOMEM_BKPT; 000743 } 000744 } 000745 assert( !pCur->curIntKey || !pCur->pKey ); 000746 return rc; 000747 } 000748 000749 /* 000750 ** Save the current cursor position in the variables BtCursor.nKey 000751 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. 000752 ** 000753 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID) 000754 ** prior to calling this routine. 000755 */ 000756 static int saveCursorPosition(BtCursor *pCur){ 000757 int rc; 000758 000759 assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState ); 000760 assert( 0==pCur->pKey ); 000761 assert( cursorHoldsMutex(pCur) ); 000762 000763 if( pCur->curFlags & BTCF_Pinned ){ 000764 return SQLITE_CONSTRAINT_PINNED; 000765 } 000766 if( pCur->eState==CURSOR_SKIPNEXT ){ 000767 pCur->eState = CURSOR_VALID; 000768 }else{ 000769 pCur->skipNext = 0; 000770 } 000771 000772 rc = saveCursorKey(pCur); 000773 if( rc==SQLITE_OK ){ 000774 btreeReleaseAllCursorPages(pCur); 000775 pCur->eState = CURSOR_REQUIRESEEK; 000776 } 000777 000778 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast); 000779 return rc; 000780 } 000781 000782 /* Forward reference */ 000783 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*); 000784 000785 /* 000786 ** Save the positions of all cursors (except pExcept) that are open on 000787 ** the table with root-page iRoot. 
"Saving the cursor position" means that 000788 ** the location in the btree is remembered in such a way that it can be 000789 ** moved back to the same spot after the btree has been modified. This 000790 ** routine is called just before cursor pExcept is used to modify the 000791 ** table, for example in BtreeDelete() or BtreeInsert(). 000792 ** 000793 ** If there are two or more cursors on the same btree, then all such 000794 ** cursors should have their BTCF_Multiple flag set. The btreeCursor() 000795 ** routine enforces that rule. This routine only needs to be called in 000796 ** the uncommon case when pExpect has the BTCF_Multiple flag set. 000797 ** 000798 ** If pExpect!=NULL and if no other cursors are found on the same root-page, 000799 ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another 000800 ** pointless call to this routine. 000801 ** 000802 ** Implementation note: This routine merely checks to see if any cursors 000803 ** need to be saved. It calls out to saveCursorsOnList() in the (unusual) 000804 ** event that cursors are in need to being saved. 000805 */ 000806 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ 000807 BtCursor *p; 000808 assert( sqlite3_mutex_held(pBt->mutex) ); 000809 assert( pExcept==0 || pExcept->pBt==pBt ); 000810 for(p=pBt->pCursor; p; p=p->pNext){ 000811 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break; 000812 } 000813 if( p ) return saveCursorsOnList(p, iRoot, pExcept); 000814 if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple; 000815 return SQLITE_OK; 000816 } 000817 000818 /* This helper routine to saveAllCursors does the actual work of saving 000819 ** the cursors if and when a cursor is found that actually requires saving. 000820 ** The common case is that no cursors need to be saved, so this routine is 000821 ** broken out from its caller to avoid unnecessary stack pointer movement. 
000822 */ 000823 static int SQLITE_NOINLINE saveCursorsOnList( 000824 BtCursor *p, /* The first cursor that needs saving */ 000825 Pgno iRoot, /* Only save cursor with this iRoot. Save all if zero */ 000826 BtCursor *pExcept /* Do not save this cursor */ 000827 ){ 000828 do{ 000829 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){ 000830 if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){ 000831 int rc = saveCursorPosition(p); 000832 if( SQLITE_OK!=rc ){ 000833 return rc; 000834 } 000835 }else{ 000836 testcase( p->iPage>=0 ); 000837 btreeReleaseAllCursorPages(p); 000838 } 000839 } 000840 p = p->pNext; 000841 }while( p ); 000842 return SQLITE_OK; 000843 } 000844 000845 /* 000846 ** Clear the current cursor position. 000847 */ 000848 void sqlite3BtreeClearCursor(BtCursor *pCur){ 000849 assert( cursorHoldsMutex(pCur) ); 000850 sqlite3_free(pCur->pKey); 000851 pCur->pKey = 0; 000852 pCur->eState = CURSOR_INVALID; 000853 } 000854 000855 /* 000856 ** In this version of BtreeMoveto, pKey is a packed index record 000857 ** such as is generated by the OP_MakeRecord opcode. Unpack the 000858 ** record and then call sqlite3BtreeIndexMoveto() to do the work. 000859 */ 000860 static int btreeMoveto( 000861 BtCursor *pCur, /* Cursor open on the btree to be searched */ 000862 const void *pKey, /* Packed key if the btree is an index */ 000863 i64 nKey, /* Integer key for tables. 
Size of pKey for indices */ 000864 int bias, /* Bias search to the high end */ 000865 int *pRes /* Write search results here */ 000866 ){ 000867 int rc; /* Status code */ 000868 UnpackedRecord *pIdxKey; /* Unpacked index key */ 000869 000870 if( pKey ){ 000871 KeyInfo *pKeyInfo = pCur->pKeyInfo; 000872 assert( nKey==(i64)(int)nKey ); 000873 pIdxKey = sqlite3VdbeAllocUnpackedRecord(pKeyInfo); 000874 if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT; 000875 sqlite3VdbeRecordUnpack(pKeyInfo, (int)nKey, pKey, pIdxKey); 000876 if( pIdxKey->nField==0 || pIdxKey->nField>pKeyInfo->nAllField ){ 000877 rc = SQLITE_CORRUPT_BKPT; 000878 }else{ 000879 rc = sqlite3BtreeIndexMoveto(pCur, pIdxKey, pRes); 000880 } 000881 sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey); 000882 }else{ 000883 pIdxKey = 0; 000884 rc = sqlite3BtreeTableMoveto(pCur, nKey, bias, pRes); 000885 } 000886 return rc; 000887 } 000888 000889 /* 000890 ** Restore the cursor to the position it was in (or as close to as possible) 000891 ** when saveCursorPosition() was called. Note that this call deletes the 000892 ** saved position info stored by saveCursorPosition(), so there can be 000893 ** at most one effective restoreCursorPosition() call after each 000894 ** saveCursorPosition(). 
000895 */ 000896 static int btreeRestoreCursorPosition(BtCursor *pCur){ 000897 int rc; 000898 int skipNext = 0; 000899 assert( cursorOwnsBtShared(pCur) ); 000900 assert( pCur->eState>=CURSOR_REQUIRESEEK ); 000901 if( pCur->eState==CURSOR_FAULT ){ 000902 return pCur->skipNext; 000903 } 000904 pCur->eState = CURSOR_INVALID; 000905 if( sqlite3FaultSim(410) ){ 000906 rc = SQLITE_IOERR; 000907 }else{ 000908 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext); 000909 } 000910 if( rc==SQLITE_OK ){ 000911 sqlite3_free(pCur->pKey); 000912 pCur->pKey = 0; 000913 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID ); 000914 if( skipNext ) pCur->skipNext = skipNext; 000915 if( pCur->skipNext && pCur->eState==CURSOR_VALID ){ 000916 pCur->eState = CURSOR_SKIPNEXT; 000917 } 000918 } 000919 return rc; 000920 } 000921 000922 #define restoreCursorPosition(p) \ 000923 (p->eState>=CURSOR_REQUIRESEEK ? \ 000924 btreeRestoreCursorPosition(p) : \ 000925 SQLITE_OK) 000926 000927 /* 000928 ** Determine whether or not a cursor has moved from the position where 000929 ** it was last placed, or has been invalidated for any other reason. 000930 ** Cursors can move when the row they are pointing at is deleted out 000931 ** from under them, for example. Cursor might also move if a btree 000932 ** is rebalanced. 000933 ** 000934 ** Calling this routine with a NULL cursor pointer returns false. 000935 ** 000936 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor 000937 ** back to where it ought to be if this routine returns true. 
000938 */ 000939 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){ 000940 assert( EIGHT_BYTE_ALIGNMENT(pCur) 000941 || pCur==sqlite3BtreeFakeValidCursor() ); 000942 assert( offsetof(BtCursor, eState)==0 ); 000943 assert( sizeof(pCur->eState)==1 ); 000944 return CURSOR_VALID != *(u8*)pCur; 000945 } 000946 000947 /* 000948 ** Return a pointer to a fake BtCursor object that will always answer 000949 ** false to the sqlite3BtreeCursorHasMoved() routine above. The fake 000950 ** cursor returned must not be used with any other Btree interface. 000951 */ 000952 BtCursor *sqlite3BtreeFakeValidCursor(void){ 000953 static u8 fakeCursor = CURSOR_VALID; 000954 assert( offsetof(BtCursor, eState)==0 ); 000955 return (BtCursor*)&fakeCursor; 000956 } 000957 000958 /* 000959 ** This routine restores a cursor back to its original position after it 000960 ** has been moved by some outside activity (such as a btree rebalance or 000961 ** a row having been deleted out from under the cursor). 000962 ** 000963 ** On success, the *pDifferentRow parameter is false if the cursor is left 000964 ** pointing at exactly the same row. *pDifferntRow is the row the cursor 000965 ** was pointing to has been deleted, forcing the cursor to point to some 000966 ** nearby row. 000967 ** 000968 ** This routine should only be called for a cursor that just returned 000969 ** TRUE from sqlite3BtreeCursorHasMoved(). 000970 */ 000971 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){ 000972 int rc; 000973 000974 assert( pCur!=0 ); 000975 assert( pCur->eState!=CURSOR_VALID ); 000976 rc = restoreCursorPosition(pCur); 000977 if( rc ){ 000978 *pDifferentRow = 1; 000979 return rc; 000980 } 000981 if( pCur->eState!=CURSOR_VALID ){ 000982 *pDifferentRow = 1; 000983 }else{ 000984 *pDifferentRow = 0; 000985 } 000986 return SQLITE_OK; 000987 } 000988 000989 #ifdef SQLITE_ENABLE_CURSOR_HINTS 000990 /* 000991 ** Provide hints to the cursor. 
The particular hint given (and the type 000992 ** and number of the varargs parameters) is determined by the eHintType 000993 ** parameter. See the definitions of the BTREE_HINT_* macros for details. 000994 */ 000995 void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){ 000996 /* Used only by system that substitute their own storage engine */ 000997 #ifdef SQLITE_DEBUG 000998 if( ALWAYS(eHintType==BTREE_HINT_RANGE) ){ 000999 va_list ap; 001000 Expr *pExpr; 001001 Walker w; 001002 memset(&w, 0, sizeof(w)); 001003 w.xExprCallback = sqlite3CursorRangeHintExprCheck; 001004 va_start(ap, eHintType); 001005 pExpr = va_arg(ap, Expr*); 001006 w.u.aMem = va_arg(ap, Mem*); 001007 va_end(ap); 001008 assert( pExpr!=0 ); 001009 assert( w.u.aMem!=0 ); 001010 sqlite3WalkExpr(&w, pExpr); 001011 } 001012 #endif /* SQLITE_DEBUG */ 001013 } 001014 #endif /* SQLITE_ENABLE_CURSOR_HINTS */ 001015 001016 001017 /* 001018 ** Provide flag hints to the cursor. 001019 */ 001020 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){ 001021 assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 ); 001022 pCur->hints = x; 001023 } 001024 001025 001026 #ifndef SQLITE_OMIT_AUTOVACUUM 001027 /* 001028 ** Given a page number of a regular database page, return the page 001029 ** number for the pointer-map page that contains the entry for the 001030 ** input page number. 001031 ** 001032 ** Return 0 (not a valid page) for pgno==1 since there is 001033 ** no pointer map associated with page 1. The integrity_check logic 001034 ** requires that ptrmapPageno(*,1)!=1. 
001035 */ 001036 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ 001037 int nPagesPerMapPage; 001038 Pgno iPtrMap, ret; 001039 assert( sqlite3_mutex_held(pBt->mutex) ); 001040 if( pgno<2 ) return 0; 001041 nPagesPerMapPage = (pBt->usableSize/5)+1; 001042 iPtrMap = (pgno-2)/nPagesPerMapPage; 001043 ret = (iPtrMap*nPagesPerMapPage) + 2; 001044 if( ret==PENDING_BYTE_PAGE(pBt) ){ 001045 ret++; 001046 } 001047 return ret; 001048 } 001049 001050 /* 001051 ** Write an entry into the pointer map. 001052 ** 001053 ** This routine updates the pointer map entry for page number 'key' 001054 ** so that it maps to type 'eType' and parent page number 'pgno'. 001055 ** 001056 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is 001057 ** a no-op. If an error occurs, the appropriate error code is written 001058 ** into *pRC. 001059 */ 001060 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){ 001061 DbPage *pDbPage; /* The pointer map page */ 001062 u8 *pPtrmap; /* The pointer map data */ 001063 Pgno iPtrmap; /* The pointer map page number */ 001064 int offset; /* Offset in pointer map page */ 001065 int rc; /* Return code from subfunctions */ 001066 001067 if( *pRC ) return; 001068 001069 assert( sqlite3_mutex_held(pBt->mutex) ); 001070 /* The super-journal page number must never be used as a pointer map page */ 001071 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) ); 001072 001073 assert( pBt->autoVacuum ); 001074 if( key==0 ){ 001075 *pRC = SQLITE_CORRUPT_BKPT; 001076 return; 001077 } 001078 iPtrmap = PTRMAP_PAGENO(pBt, key); 001079 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0); 001080 if( rc!=SQLITE_OK ){ 001081 *pRC = rc; 001082 return; 001083 } 001084 if( ((char*)sqlite3PagerGetExtra(pDbPage))[0]!=0 ){ 001085 /* The first byte of the extra data is the MemPage.isInit byte. 001086 ** If that byte is set, it means this page is also being used 001087 ** as a btree page. 
*/ 001088 *pRC = SQLITE_CORRUPT_BKPT; 001089 goto ptrmap_exit; 001090 } 001091 offset = PTRMAP_PTROFFSET(iPtrmap, key); 001092 if( offset<0 ){ 001093 *pRC = SQLITE_CORRUPT_BKPT; 001094 goto ptrmap_exit; 001095 } 001096 assert( offset <= (int)pBt->usableSize-5 ); 001097 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 001098 001099 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ 001100 TRACE(("PTRMAP_UPDATE: %u->(%u,%u)\n", key, eType, parent)); 001101 *pRC= rc = sqlite3PagerWrite(pDbPage); 001102 if( rc==SQLITE_OK ){ 001103 pPtrmap[offset] = eType; 001104 put4byte(&pPtrmap[offset+1], parent); 001105 } 001106 } 001107 001108 ptrmap_exit: 001109 sqlite3PagerUnref(pDbPage); 001110 } 001111 001112 /* 001113 ** Read an entry from the pointer map. 001114 ** 001115 ** This routine retrieves the pointer map entry for page 'key', writing 001116 ** the type and parent page number to *pEType and *pPgno respectively. 001117 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 
001118 */ 001119 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ 001120 DbPage *pDbPage; /* The pointer map page */ 001121 int iPtrmap; /* Pointer map page index */ 001122 u8 *pPtrmap; /* Pointer map page data */ 001123 int offset; /* Offset of entry in pointer map */ 001124 int rc; 001125 001126 assert( sqlite3_mutex_held(pBt->mutex) ); 001127 001128 iPtrmap = PTRMAP_PAGENO(pBt, key); 001129 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0); 001130 if( rc!=0 ){ 001131 return rc; 001132 } 001133 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 001134 001135 offset = PTRMAP_PTROFFSET(iPtrmap, key); 001136 if( offset<0 ){ 001137 sqlite3PagerUnref(pDbPage); 001138 return SQLITE_CORRUPT_BKPT; 001139 } 001140 assert( offset <= (int)pBt->usableSize-5 ); 001141 assert( pEType!=0 ); 001142 *pEType = pPtrmap[offset]; 001143 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); 001144 001145 sqlite3PagerUnref(pDbPage); 001146 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap); 001147 return SQLITE_OK; 001148 } 001149 001150 #else /* if defined SQLITE_OMIT_AUTOVACUUM */ 001151 #define ptrmapPut(w,x,y,z,rc) 001152 #define ptrmapGet(w,x,y,z) SQLITE_OK 001153 #define ptrmapPutOvflPtr(x, y, z, rc) 001154 #endif 001155 001156 /* 001157 ** Given a btree page and a cell index (0 means the first cell on 001158 ** the page, 1 means the second cell, and so forth) return a pointer 001159 ** to the cell content. 001160 ** 001161 ** findCellPastPtr() does the same except it skips past the initial 001162 ** 4-byte child pointer found on interior pages, if there is one. 001163 ** 001164 ** This routine works only for pages that do not contain overflow cells. 
001165 */ 001166 #define findCell(P,I) \ 001167 ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)]))) 001168 #define findCellPastPtr(P,I) \ 001169 ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)]))) 001170 001171 001172 /* 001173 ** This is common tail processing for btreeParseCellPtr() and 001174 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely 001175 ** on a single B-tree page. Make necessary adjustments to the CellInfo 001176 ** structure. 001177 */ 001178 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow( 001179 MemPage *pPage, /* Page containing the cell */ 001180 u8 *pCell, /* Pointer to the cell text. */ 001181 CellInfo *pInfo /* Fill in this structure */ 001182 ){ 001183 /* If the payload will not fit completely on the local page, we have 001184 ** to decide how much to store locally and how much to spill onto 001185 ** overflow pages. The strategy is to minimize the amount of unused 001186 ** space on overflow pages while keeping the amount of local storage 001187 ** in between minLocal and maxLocal. 001188 ** 001189 ** Warning: changing the way overflow payload is distributed in any 001190 ** way will result in an incompatible file format. 
001191 */ 001192 int minLocal; /* Minimum amount of payload held locally */ 001193 int maxLocal; /* Maximum amount of payload held locally */ 001194 int surplus; /* Overflow payload available for local storage */ 001195 001196 minLocal = pPage->minLocal; 001197 maxLocal = pPage->maxLocal; 001198 surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4); 001199 testcase( surplus==maxLocal ); 001200 testcase( surplus==maxLocal+1 ); 001201 if( surplus <= maxLocal ){ 001202 pInfo->nLocal = (u16)surplus; 001203 }else{ 001204 pInfo->nLocal = (u16)minLocal; 001205 } 001206 pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4; 001207 } 001208 001209 /* 001210 ** Given a record with nPayload bytes of payload stored within btree 001211 ** page pPage, return the number of bytes of payload stored locally. 001212 */ 001213 static int btreePayloadToLocal(MemPage *pPage, i64 nPayload){ 001214 int maxLocal; /* Maximum amount of payload held locally */ 001215 maxLocal = pPage->maxLocal; 001216 if( nPayload<=maxLocal ){ 001217 return nPayload; 001218 }else{ 001219 int minLocal; /* Minimum amount of payload held locally */ 001220 int surplus; /* Overflow payload available for local storage */ 001221 minLocal = pPage->minLocal; 001222 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize-4); 001223 return ( surplus <= maxLocal ) ? surplus : minLocal; 001224 } 001225 } 001226 001227 /* 001228 ** The following routines are implementations of the MemPage.xParseCell() 001229 ** method. 001230 ** 001231 ** Parse a cell content block and fill in the CellInfo structure. 001232 ** 001233 ** btreeParseCellPtr() => table btree leaf nodes 001234 ** btreeParseCellNoPayload() => table btree internal nodes 001235 ** btreeParseCellPtrIndex() => index btree nodes 001236 ** 001237 ** There is also a wrapper function btreeParseCell() that works for 001238 ** all MemPage types and that references the cell by index rather than 001239 ** by pointer. 
001240 */ 001241 static void btreeParseCellPtrNoPayload( 001242 MemPage *pPage, /* Page containing the cell */ 001243 u8 *pCell, /* Pointer to the cell text. */ 001244 CellInfo *pInfo /* Fill in this structure */ 001245 ){ 001246 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001247 assert( pPage->leaf==0 ); 001248 assert( pPage->childPtrSize==4 ); 001249 #ifndef SQLITE_DEBUG 001250 UNUSED_PARAMETER(pPage); 001251 #endif 001252 pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey); 001253 pInfo->nPayload = 0; 001254 pInfo->nLocal = 0; 001255 pInfo->pPayload = 0; 001256 return; 001257 } 001258 static void btreeParseCellPtr( 001259 MemPage *pPage, /* Page containing the cell */ 001260 u8 *pCell, /* Pointer to the cell text. */ 001261 CellInfo *pInfo /* Fill in this structure */ 001262 ){ 001263 u8 *pIter; /* For scanning through pCell */ 001264 u32 nPayload; /* Number of bytes of cell payload */ 001265 u64 iKey; /* Extracted Key value */ 001266 001267 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001268 assert( pPage->leaf==0 || pPage->leaf==1 ); 001269 assert( pPage->intKeyLeaf ); 001270 assert( pPage->childPtrSize==0 ); 001271 pIter = pCell; 001272 001273 /* The next block of code is equivalent to: 001274 ** 001275 ** pIter += getVarint32(pIter, nPayload); 001276 ** 001277 ** The code is inlined to avoid a function call. 001278 */ 001279 nPayload = *pIter; 001280 if( nPayload>=0x80 ){ 001281 u8 *pEnd = &pIter[8]; 001282 nPayload &= 0x7f; 001283 do{ 001284 nPayload = (nPayload<<7) | (*++pIter & 0x7f); 001285 }while( (*pIter)>=0x80 && pIter<pEnd ); 001286 } 001287 pIter++; 001288 001289 /* The next block of code is equivalent to: 001290 ** 001291 ** pIter += getVarint(pIter, (u64*)&pInfo->nKey); 001292 ** 001293 ** The code is inlined and the loop is unrolled for performance. 001294 ** This routine is a high-runner. 
001295 */ 001296 iKey = *pIter; 001297 if( iKey>=0x80 ){ 001298 u8 x; 001299 iKey = (iKey<<7) ^ (x = *++pIter); 001300 if( x>=0x80 ){ 001301 iKey = (iKey<<7) ^ (x = *++pIter); 001302 if( x>=0x80 ){ 001303 iKey = (iKey<<7) ^ 0x10204000 ^ (x = *++pIter); 001304 if( x>=0x80 ){ 001305 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001306 if( x>=0x80 ){ 001307 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001308 if( x>=0x80 ){ 001309 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001310 if( x>=0x80 ){ 001311 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001312 if( x>=0x80 ){ 001313 iKey = (iKey<<8) ^ 0x8000 ^ (*++pIter); 001314 } 001315 } 001316 } 001317 } 001318 } 001319 }else{ 001320 iKey ^= 0x204000; 001321 } 001322 }else{ 001323 iKey ^= 0x4000; 001324 } 001325 } 001326 pIter++; 001327 001328 pInfo->nKey = *(i64*)&iKey; 001329 pInfo->nPayload = nPayload; 001330 pInfo->pPayload = pIter; 001331 testcase( nPayload==pPage->maxLocal ); 001332 testcase( nPayload==(u32)pPage->maxLocal+1 ); 001333 if( nPayload<=pPage->maxLocal ){ 001334 /* This is the (easy) common case where the entire payload fits 001335 ** on the local page. No overflow is required. 001336 */ 001337 pInfo->nSize = nPayload + (u16)(pIter - pCell); 001338 if( pInfo->nSize<4 ) pInfo->nSize = 4; 001339 pInfo->nLocal = (u16)nPayload; 001340 }else{ 001341 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo); 001342 } 001343 } 001344 static void btreeParseCellPtrIndex( 001345 MemPage *pPage, /* Page containing the cell */ 001346 u8 *pCell, /* Pointer to the cell text. 
*/ 001347 CellInfo *pInfo /* Fill in this structure */ 001348 ){ 001349 u8 *pIter; /* For scanning through pCell */ 001350 u32 nPayload; /* Number of bytes of cell payload */ 001351 001352 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001353 assert( pPage->leaf==0 || pPage->leaf==1 ); 001354 assert( pPage->intKeyLeaf==0 ); 001355 pIter = pCell + pPage->childPtrSize; 001356 nPayload = *pIter; 001357 if( nPayload>=0x80 ){ 001358 u8 *pEnd = &pIter[8]; 001359 nPayload &= 0x7f; 001360 do{ 001361 nPayload = (nPayload<<7) | (*++pIter & 0x7f); 001362 }while( *(pIter)>=0x80 && pIter<pEnd ); 001363 } 001364 pIter++; 001365 pInfo->nKey = nPayload; 001366 pInfo->nPayload = nPayload; 001367 pInfo->pPayload = pIter; 001368 testcase( nPayload==pPage->maxLocal ); 001369 testcase( nPayload==(u32)pPage->maxLocal+1 ); 001370 if( nPayload<=pPage->maxLocal ){ 001371 /* This is the (easy) common case where the entire payload fits 001372 ** on the local page. No overflow is required. 001373 */ 001374 pInfo->nSize = nPayload + (u16)(pIter - pCell); 001375 if( pInfo->nSize<4 ) pInfo->nSize = 4; 001376 pInfo->nLocal = (u16)nPayload; 001377 }else{ 001378 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo); 001379 } 001380 } 001381 static void btreeParseCell( 001382 MemPage *pPage, /* Page containing the cell */ 001383 int iCell, /* The cell index. First cell is 0 */ 001384 CellInfo *pInfo /* Fill in this structure */ 001385 ){ 001386 pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo); 001387 } 001388 001389 /* 001390 ** The following routines are implementations of the MemPage.xCellSize 001391 ** method. 001392 ** 001393 ** Compute the total number of bytes that a Cell needs in the cell 001394 ** data area of the btree-page. The return number includes the cell 001395 ** data header and the local payload, but not any overflow page or 001396 ** the space used by the cell pointer. 
001397 ** 001398 ** cellSizePtrNoPayload() => table internal nodes 001399 ** cellSizePtrTableLeaf() => table leaf nodes 001400 ** cellSizePtr() => index internal nodes 001401 ** cellSizeIdxLeaf() => index leaf nodes 001402 */ 001403 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ 001404 u8 *pIter = pCell + 4; /* For looping over bytes of pCell */ 001405 u8 *pEnd; /* End mark for a varint */ 001406 u32 nSize; /* Size value to return */ 001407 001408 #ifdef SQLITE_DEBUG 001409 /* The value returned by this function should always be the same as 001410 ** the (CellInfo.nSize) value found by doing a full parse of the 001411 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001412 ** this function verifies that this invariant is not violated. */ 001413 CellInfo debuginfo; 001414 pPage->xParseCell(pPage, pCell, &debuginfo); 001415 #endif 001416 001417 assert( pPage->childPtrSize==4 ); 001418 nSize = *pIter; 001419 if( nSize>=0x80 ){ 001420 pEnd = &pIter[8]; 001421 nSize &= 0x7f; 001422 do{ 001423 nSize = (nSize<<7) | (*++pIter & 0x7f); 001424 }while( *(pIter)>=0x80 && pIter<pEnd ); 001425 } 001426 pIter++; 001427 testcase( nSize==pPage->maxLocal ); 001428 testcase( nSize==(u32)pPage->maxLocal+1 ); 001429 if( nSize<=pPage->maxLocal ){ 001430 nSize += (u32)(pIter - pCell); 001431 assert( nSize>4 ); 001432 }else{ 001433 int minLocal = pPage->minLocal; 001434 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 001435 testcase( nSize==pPage->maxLocal ); 001436 testcase( nSize==(u32)pPage->maxLocal+1 ); 001437 if( nSize>pPage->maxLocal ){ 001438 nSize = minLocal; 001439 } 001440 nSize += 4 + (u16)(pIter - pCell); 001441 } 001442 assert( nSize==debuginfo.nSize || CORRUPT_DB ); 001443 return (u16)nSize; 001444 } 001445 static u16 cellSizePtrIdxLeaf(MemPage *pPage, u8 *pCell){ 001446 u8 *pIter = pCell; /* For looping over bytes of pCell */ 001447 u8 *pEnd; /* End mark for a varint */ 001448 u32 nSize; /* Size value to return */ 001449 001450 
#ifdef SQLITE_DEBUG 001451 /* The value returned by this function should always be the same as 001452 ** the (CellInfo.nSize) value found by doing a full parse of the 001453 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001454 ** this function verifies that this invariant is not violated. */ 001455 CellInfo debuginfo; 001456 pPage->xParseCell(pPage, pCell, &debuginfo); 001457 #endif 001458 001459 assert( pPage->childPtrSize==0 ); 001460 nSize = *pIter; 001461 if( nSize>=0x80 ){ 001462 pEnd = &pIter[8]; 001463 nSize &= 0x7f; 001464 do{ 001465 nSize = (nSize<<7) | (*++pIter & 0x7f); 001466 }while( *(pIter)>=0x80 && pIter<pEnd ); 001467 } 001468 pIter++; 001469 testcase( nSize==pPage->maxLocal ); 001470 testcase( nSize==(u32)pPage->maxLocal+1 ); 001471 if( nSize<=pPage->maxLocal ){ 001472 nSize += (u32)(pIter - pCell); 001473 if( nSize<4 ) nSize = 4; 001474 }else{ 001475 int minLocal = pPage->minLocal; 001476 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 001477 testcase( nSize==pPage->maxLocal ); 001478 testcase( nSize==(u32)pPage->maxLocal+1 ); 001479 if( nSize>pPage->maxLocal ){ 001480 nSize = minLocal; 001481 } 001482 nSize += 4 + (u16)(pIter - pCell); 001483 } 001484 assert( nSize==debuginfo.nSize || CORRUPT_DB ); 001485 return (u16)nSize; 001486 } 001487 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){ 001488 u8 *pIter = pCell + 4; /* For looping over bytes of pCell */ 001489 u8 *pEnd; /* End mark for a varint */ 001490 001491 #ifdef SQLITE_DEBUG 001492 /* The value returned by this function should always be the same as 001493 ** the (CellInfo.nSize) value found by doing a full parse of the 001494 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001495 ** this function verifies that this invariant is not violated. 
*/ 001496 CellInfo debuginfo; 001497 pPage->xParseCell(pPage, pCell, &debuginfo); 001498 #else 001499 UNUSED_PARAMETER(pPage); 001500 #endif 001501 001502 assert( pPage->childPtrSize==4 ); 001503 pEnd = pIter + 9; 001504 while( (*pIter++)&0x80 && pIter<pEnd ); 001505 assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB ); 001506 return (u16)(pIter - pCell); 001507 } 001508 static u16 cellSizePtrTableLeaf(MemPage *pPage, u8 *pCell){ 001509 u8 *pIter = pCell; /* For looping over bytes of pCell */ 001510 u8 *pEnd; /* End mark for a varint */ 001511 u32 nSize; /* Size value to return */ 001512 001513 #ifdef SQLITE_DEBUG 001514 /* The value returned by this function should always be the same as 001515 ** the (CellInfo.nSize) value found by doing a full parse of the 001516 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001517 ** this function verifies that this invariant is not violated. */ 001518 CellInfo debuginfo; 001519 pPage->xParseCell(pPage, pCell, &debuginfo); 001520 #endif 001521 001522 nSize = *pIter; 001523 if( nSize>=0x80 ){ 001524 pEnd = &pIter[8]; 001525 nSize &= 0x7f; 001526 do{ 001527 nSize = (nSize<<7) | (*++pIter & 0x7f); 001528 }while( *(pIter)>=0x80 && pIter<pEnd ); 001529 } 001530 pIter++; 001531 /* pIter now points at the 64-bit integer key value, a variable length 001532 ** integer. The following block moves pIter to point at the first byte 001533 ** past the end of the key value. 
*/ 001534 if( (*pIter++)&0x80 001535 && (*pIter++)&0x80 001536 && (*pIter++)&0x80 001537 && (*pIter++)&0x80 001538 && (*pIter++)&0x80 001539 && (*pIter++)&0x80 001540 && (*pIter++)&0x80 001541 && (*pIter++)&0x80 ){ pIter++; } 001542 testcase( nSize==pPage->maxLocal ); 001543 testcase( nSize==(u32)pPage->maxLocal+1 ); 001544 if( nSize<=pPage->maxLocal ){ 001545 nSize += (u32)(pIter - pCell); 001546 if( nSize<4 ) nSize = 4; 001547 }else{ 001548 int minLocal = pPage->minLocal; 001549 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 001550 testcase( nSize==pPage->maxLocal ); 001551 testcase( nSize==(u32)pPage->maxLocal+1 ); 001552 if( nSize>pPage->maxLocal ){ 001553 nSize = minLocal; 001554 } 001555 nSize += 4 + (u16)(pIter - pCell); 001556 } 001557 assert( nSize==debuginfo.nSize || CORRUPT_DB ); 001558 return (u16)nSize; 001559 } 001560 001561 001562 #ifdef SQLITE_DEBUG 001563 /* This variation on cellSizePtr() is used inside of assert() statements 001564 ** only. */ 001565 static u16 cellSize(MemPage *pPage, int iCell){ 001566 return pPage->xCellSize(pPage, findCell(pPage, iCell)); 001567 } 001568 #endif 001569 001570 #ifndef SQLITE_OMIT_AUTOVACUUM 001571 /* 001572 ** The cell pCell is currently part of page pSrc but will ultimately be part 001573 ** of pPage. (pSrc and pPage are often the same.) If pCell contains a 001574 ** pointer to an overflow page, insert an entry into the pointer-map for 001575 ** the overflow page that will be valid after pCell has been moved to pPage. 
001576 */ 001577 static void ptrmapPutOvflPtr(MemPage *pPage, MemPage *pSrc, u8 *pCell,int *pRC){ 001578 CellInfo info; 001579 if( *pRC ) return; 001580 assert( pCell!=0 ); 001581 pPage->xParseCell(pPage, pCell, &info); 001582 if( info.nLocal<info.nPayload ){ 001583 Pgno ovfl; 001584 if( SQLITE_OVERFLOW(pSrc->aDataEnd, pCell, pCell+info.nLocal) ){ 001585 testcase( pSrc!=pPage ); 001586 *pRC = SQLITE_CORRUPT_BKPT; 001587 return; 001588 } 001589 ovfl = get4byte(&pCell[info.nSize-4]); 001590 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC); 001591 } 001592 } 001593 #endif 001594 001595 001596 /* 001597 ** Defragment the page given. This routine reorganizes cells within the 001598 ** page so that there are no free-blocks on the free-block list. 001599 ** 001600 ** Parameter nMaxFrag is the maximum amount of fragmented space that may be 001601 ** present in the page after this routine returns. 001602 ** 001603 ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a 001604 ** b-tree page so that there are no freeblocks or fragment bytes, all 001605 ** unused bytes are contained in the unallocated space region, and all 001606 ** cells are packed tightly at the end of the page. 
*/
static int defragmentPage(MemPage *pPage, int nMaxFrag){
  int i;                     /* Loop counter */
  int pc;                    /* Address of the i-th cell */
  int hdr;                   /* Offset to the page header */
  int size;                  /* Size of a cell */
  int usableSize;            /* Number of usable bytes on a page */
  int cellOffset;            /* Offset to the cell pointer array */
  int cbrk;                  /* Offset to the cell content area */
  int nCell;                 /* Number of cells on the page */
  unsigned char *data;       /* The page data */
  unsigned char *temp;       /* Temp area for cell content */
  unsigned char *src;        /* Source of content */
  int iCellFirst;            /* First allowable cell index */
  int iCellLast;             /* Last possible cell index */
  int iCellStart;            /* First cell offset in input */

  /* Preconditions: the page is writable, it has no overflow cells, and
  ** the BtShared mutex is held. */
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt!=0 );
  assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
  assert( pPage->nOverflow==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  data = pPage->aData;
  hdr = pPage->hdrOffset;
  cellOffset = pPage->cellOffset;
  nCell = pPage->nCell;
  assert( nCell==get2byte(&data[hdr+3]) || CORRUPT_DB );
  iCellFirst = cellOffset + 2*nCell;   /* First byte past the pointer array */
  usableSize = pPage->pBt->usableSize;

  /* This block handles pages with two or fewer free blocks and nMaxFrag
  ** or fewer fragmented bytes. In this case it is faster to move the
  ** two (or one) blocks of cells using memmove() and add the required
  ** offsets to each pointer in the cell-pointer array than it is to
  ** reconstruct the entire page.
  **
  ** Note that this path leaves the fragment counter data[hdr+7] unchanged;
  ** only the full rebuild further down resets it to zero.
  */
  if( (int)data[hdr+7]<=nMaxFrag ){
    int iFree = get2byte(&data[hdr+1]);        /* First freeblock, 0 if none */
    if( iFree>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
    if( iFree ){
      int iFree2 = get2byte(&data[iFree]);     /* Second freeblock, 0 if none */
      if( iFree2>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
      /* Continue only if the freelist has at most two entries: either
      ** there is no second freeblock, or the second freeblock's own
      ** next-pointer is zero. */
      if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){
        u8 *pEnd = &data[cellOffset + nCell*2]; /* End of cell pointer array */
        u8 *pAddr;                              /* Current cell pointer */
        int sz2 = 0;                            /* Size of second freeblock */
        int sz = get2byte(&data[iFree+2]);      /* Size of first freeblock */
        int top = get2byte(&data[hdr+5]);       /* Start of cell content */
        if( top>=iFree ){
          /* The first freeblock must lie strictly after the start of
          ** the cell content area. */
          return SQLITE_CORRUPT_PAGE(pPage);
        }
        if( iFree2 ){
          /* The first freeblock may not run into the second one, and the
          ** second one may not run past the usable area. */
          if( iFree+sz>iFree2 ) return SQLITE_CORRUPT_PAGE(pPage);
          sz2 = get2byte(&data[iFree2+2]);
          if( iFree2+sz2 > usableSize ) return SQLITE_CORRUPT_PAGE(pPage);
          /* Slide the content that sits between the two freeblocks up by
          ** sz2 bytes.  Afterwards both holes form a single hole of
          ** sz+sz2 bytes beginning at iFree. */
          memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz));
          sz += sz2;
        }else if( iFree+sz>usableSize ){
          return SQLITE_CORRUPT_PAGE(pPage);
        }

        /* Slide everything between the old content start and iFree up by
        ** sz bytes, which closes the hole and moves the content start. */
        cbrk = top+sz;
        assert( cbrk+(iFree-top) <= usableSize );
        memmove(&data[cbrk], &data[top], iFree-top);
        /* Fix up the cell pointers.  Cells that were below the first
        ** freeblock moved by sz (which by now includes sz2); cells that
        ** were between the two freeblocks moved by sz2 only.  When there
        ** is no second freeblock, iFree2 is 0 and the second test is
        ** never true. */
        for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){
          pc = get2byte(pAddr);
          if( pc<iFree ){ put2byte(pAddr, pc+sz); }
          else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); }
        }
        goto defragment_out;
      }
    }
  }

  /* General case: rebuild the cell content area.  A copy of the page is
  ** taken first, then every cell is copied back from that copy, packed
  ** downward from the end of the usable area in cell-pointer order. */
  cbrk = usableSize;
  iCellLast = usableSize - 4;
  iCellStart = get2byte(&data[hdr+5]);
  if( nCell>0 ){
    temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
    memcpy(temp, data, usableSize);
    src = temp;
    for(i=0; i<nCell; i++){
      u8 *pAddr;     /* The i-th cell pointer */
      pAddr = &data[cellOffset + i*2];
      pc = get2byte(pAddr);
      testcase( pc==iCellFirst );
      testcase( pc==iCellLast );
      /* These conditions have already been verified in btreeInitPage()
      ** if PRAGMA cell_size_check=ON.
      */
      if( pc>iCellLast ){
        return SQLITE_CORRUPT_PAGE(pPage);
      }
      assert( pc>=0 && pc<=iCellLast );
      /* The cell size is measured on the copy, not on the page that is
      ** being overwritten. */
      size = pPage->xCellSize(pPage, &src[pc]);
      cbrk -= size;
      /* The repacked cells may not grow below the original content start,
      ** and the source cell may not extend past the usable area. */
      if( cbrk<iCellStart || pc+size>usableSize ){
        return SQLITE_CORRUPT_PAGE(pPage);
      }
      assert( cbrk+size<=usableSize && cbrk>=iCellStart );
      testcase( cbrk+size==usableSize );
      testcase( pc+size==usableSize );
      put2byte(pAddr, cbrk);
      memcpy(&data[cbrk], &src[pc], size);
    }
  }
  data[hdr+7] = 0;   /* A full rebuild leaves no fragmented bytes */

defragment_out:
  assert( pPage->nFree>=0 );
  /* Cross-check: the fragment count plus the gap between the cell pointer
  ** array and the cell content must match the cached free-byte count. */
  if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  assert( cbrk>=iCellFirst );
  put2byte(&data[hdr+5], cbrk);   /* New start of the cell content area */
  data[hdr+1] = 0;                /* The freelist is now empty */
  data[hdr+2] = 0;
  memset(&data[iCellFirst], 0, cbrk-iCellFirst);   /* Zero the gap */
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  return SQLITE_OK;
}

/*
** Search the free-list on page pPg for space to store a cell nByte bytes in
** size. If one can be found, return a pointer to the space and remove it
** from the free-list.
**
** If no suitable space can be found on the free-list, return NULL.
**
** This function may detect corruption within pPg.  If corruption is
** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
**
** Slots on the free list that are between 1 and 3 bytes larger than nByte
** will be ignored if adding the extra space to the fragmentation count
** causes the fragmentation count to exceed 60.
*/
static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
  const int hdr = pPg->hdrOffset;            /* Offset to page header */
  u8 * const aData = pPg->aData;             /* Page data */
  int iAddr = hdr + 1;                       /* Address of ptr to pc */
  u8 *pTmp = &aData[iAddr];                  /* Temporary ptr into aData[] */
  int pc = get2byte(pTmp);                   /* Address of a free slot */
  int x;                                     /* Excess size of the slot */
  int maxPC = pPg->pBt->usableSize - nByte;  /* Max address for a usable slot */
  int size;                                  /* Size of the free slot */

  /* The first freeblock pointer in the header is expected to be non-zero
  ** on entry. */
  assert( pc>0 );
  while( pc<=maxPC ){
    /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
    ** freeblock form a big-endian integer which is the size of the freeblock
    ** in bytes, including the 4-byte header. */
    pTmp = &aData[pc+2];
    size = get2byte(pTmp);
    if( (x = size - nByte)>=0 ){
      /* This slot is large enough.  x is the number of bytes left over. */
      testcase( x==4 );
      testcase( x==3 );
      if( x<4 ){
        /* Fewer than 4 bytes would remain, so the whole slot is consumed
        ** and the x leftover bytes are counted as fragments.
        **
        ** EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
        ** number of bytes in fragments may not exceed 60.
        **
        ** The test below is made for every x in 0..3, including x==0, and
        ** on failure the search stops with a NULL result instead of moving
        ** on to later slots. */
        if( aData[hdr+7]>57 ) return 0;

        /* Remove the slot from the free-list. Update the number of
        ** fragmented bytes within the page. */
        memcpy(&aData[iAddr], &aData[pc], 2);
        aData[hdr+7] += (u8)x;
        return &aData[pc];
      }else if( x+pc > maxPC ){
        /* This slot extends off the end of the usable part of the page */
        *pRc = SQLITE_CORRUPT_PAGE(pPg);
        return 0;
      }else{
        /* The slot remains on the free-list. Reduce its size to account
        ** for the portion used by the new allocation. */
        put2byte(&aData[pc+2], x);
      }
      /* The allocation is taken from the tail of the shrunken slot */
      return &aData[pc + x];
    }
    /* Slot too small: advance to the next freeblock in the chain */
    iAddr = pc;
    pTmp = &aData[pc];
    pc = get2byte(pTmp);
    if( pc<=iAddr ){
      if( pc ){
        /* The next slot in the chain comes before the current slot */
        *pRc = SQLITE_CORRUPT_PAGE(pPg);
      }
      /* pc==0 is the normal end of the chain: nothing suitable found */
      return 0;
    }
  }
  /* The loop ended because pc>maxPC.  Since maxPC+nByte equals the usable
  ** size, the test below is pc > usableSize-4. */
  if( pc>maxPC+nByte-4 ){
    /* The free slot chain extends off the end of the page */
    *pRc = SQLITE_CORRUPT_PAGE(pPg);
  }
  return 0;
}

/*
** Allocate nByte bytes of space from within the B-Tree page passed
** as the first argument. Write into *pIdx the index into pPage->aData[]
** of the first byte of allocated space. Return either SQLITE_OK or
** an error code (usually SQLITE_CORRUPT).
**
** The caller guarantees that there is sufficient space to make the
** allocation.  This routine might need to defragment in order to bring
** all the space together, however.  This routine will avoid using
** the first two bytes past the cell pointer area since presumably this
** allocation is being made in order to insert a new cell, so we will
** also end up needing a new cell pointer.
*/
static SQLITE_INLINE int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
  const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
  u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
  int top;                             /* First byte of cell content area */
  int rc = SQLITE_OK;                  /* Integer return code */
  u8 *pTmp;                            /* Temp ptr into data[] */
  int gap;        /* First byte of gap between cell pointers and cell content */

  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  /* Only non-negative sizes are asserted here, even though the original
  ** note on this line reads "Minimum cell size is 4". */
  assert( nByte>=0 );
  assert( pPage->nFree>=nByte );
  assert( pPage->nOverflow==0 );
  assert( nByte < (int)(pPage->pBt->usableSize-8) );

  assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
  gap = pPage->cellOffset + 2*pPage->nCell;
  assert( gap<=65536 );
  /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
  ** and the reserved space is zero (the usual value for reserved space)
  ** then the cell content offset of an empty page wants to be 65536.
  ** However, that integer is too large to be stored in a 2-byte unsigned
  ** integer, so a value of 0 is used in its place. */
  pTmp = &data[hdr+5];
  top = get2byte(pTmp);
  if( gap>top ){
    /* The content start lies before the end of the cell pointer array.
    ** That is acceptable only for the stored-zero-means-65536 case. */
    if( top==0 && pPage->pBt->usableSize==65536 ){
      top = 65536;
    }else{
      return SQLITE_CORRUPT_PAGE(pPage);
    }
  }else if( top>(int)pPage->pBt->usableSize ){
    return SQLITE_CORRUPT_PAGE(pPage);
  }

  /* If there is enough space between gap and top for one more cell pointer,
  ** and if the freelist is not empty, then search the
  ** freelist looking for a slot big enough to satisfy the request.
  */
  testcase( gap+2==top );
  testcase( gap+1==top );
  testcase( gap==top );
  if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
    u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
    if( pSpace ){
      int g2;      /* Page offset of the slot that was found */
      assert( pSpace+nByte<=data+pPage->pBt->usableSize );
      /* *pIdx is written before the check below, so it is also set when
      ** SQLITE_CORRUPT is returned. */
      *pIdx = g2 = (int)(pSpace-data);
      if( g2<=gap ){
        /* A slot at or before the end of the pointer array is invalid */
        return SQLITE_CORRUPT_PAGE(pPage);
      }else{
        return SQLITE_OK;
      }
    }else if( rc ){
      /* pageFindSlot() reported an error through rc */
      return rc;
    }
  }

  /* The request could not be fulfilled using a freelist slot.  Check
  ** to see if defragmentation is necessary.
  */
  testcase( gap+2+nByte==top );
  if( gap+2+nByte>top ){
    assert( pPage->nCell>0 || CORRUPT_DB );
    assert( pPage->nFree>=0 );
    /* The second argument is the nMaxFrag limit of defragmentPage(): at
    ** most 4, and no more than the free space that would remain after
    ** this allocation plus a 2-byte cell pointer. */
    rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte)));
    if( rc ) return rc;
    top = get2byteNotZero(&data[hdr+5]);
    assert( gap+2+nByte<=top );
  }


  /* Allocate memory from the gap in between the cell pointer array
  ** and the cell content area.  The btreeComputeFreeSpace() call has already
  ** validated the freelist.  Given that the freelist is valid, there
  ** is no way that the allocation can extend off the end of the page.
  ** The assert() below verifies the previous sentence.
  */
  top -= nByte;
  put2byte(&data[hdr+5], top);
  assert( top+nByte <= (int)pPage->pBt->usableSize );
  *pIdx = top;
  return SQLITE_OK;
}

/*
** Return a section of the pPage->aData to the freelist.
** The first byte of the new free block is pPage->aData[iStart]
** and the size of the block is iSize bytes.
**
** Adjacent freeblocks are coalesced.
**
** Even though the freeblock list was checked by btreeComputeFreeSpace(),
** that routine will not detect overlap between cells or freeblocks.
** Nor does it detect cells or freeblocks that encroach into the reserved
** bytes at the end of the page.  So do additional corruption checks inside
** this routine and return SQLITE_CORRUPT if any problems are found.
*/
static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
  u16 iPtr;                             /* Address of ptr to next freeblock */
  u16 iFreeBlk;                         /* Address of the next freeblock */
  u8 hdr;                               /* Page header offset.  0 or 100 */
  u8 nFrag = 0;                         /* Reduction in fragmentation */
  u16 iOrigSize = iSize;                /* Original value of iSize */
  u16 x;                                /* Offset to cell content area */
  u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
  unsigned char *data = pPage->aData;   /* Page content */
  u8 *pTmp;                             /* Temporary ptr into data[] */

  assert( pPage->pBt!=0 );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
  assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( iSize>=4 );   /* Minimum cell size is 4 */
  assert( CORRUPT_DB || iStart<=pPage->pBt->usableSize-4 );

  /* The list of freeblocks must be in ascending order.  Find the
  ** spot on the list where iStart should be inserted.
  */
  hdr = pPage->hdrOffset;
  iPtr = hdr + 1;
  if( data[iPtr+1]==0 && data[iPtr]==0 ){
    iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
  }else{
    /* Walk the chain until reaching a freeblock at or after iStart, or
    ** the end of the chain.  A non-zero link that does not move forward
    ** is reported as corruption. */
    while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){
      if( iFreeBlk<=iPtr ){
        if( iFreeBlk==0 ) break; /* TH3: corrupt082.100 */
        return SQLITE_CORRUPT_PAGE(pPage);
      }
      iPtr = iFreeBlk;
    }
    if( iFreeBlk>pPage->pBt->usableSize-4 ){ /* TH3: corrupt081.100 */
      return SQLITE_CORRUPT_PAGE(pPage);
    }
    assert( iFreeBlk>iPtr || iFreeBlk==0 || CORRUPT_DB );

    /* At this point:
    **    iFreeBlk:   First freeblock after iStart, or zero if none
    **    iPtr:       The address of a pointer to iFreeBlk
    **
    ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
    ** Coalescing happens when at most 3 bytes separate the two; those
    ** bytes are counted in nFrag.
    */
    if( iFreeBlk && iEnd+3>=iFreeBlk ){
      nFrag = iFreeBlk - iEnd;
      if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PAGE(pPage);
      iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
      if( iEnd > pPage->pBt->usableSize ){
        return SQLITE_CORRUPT_PAGE(pPage);
      }
      iSize = iEnd - iStart;
      /* Skip over the absorbed freeblock */
      iFreeBlk = get2byte(&data[iFreeBlk]);
    }

    /* If iPtr is another freeblock (that is, if iPtr is not the freelist
    ** pointer in the page header) then check to see if iStart should be
    ** coalesced onto the end of iPtr.
    */
    if( iPtr>hdr+1 ){
      int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
      if( iPtrEnd+3>=iStart ){
        if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PAGE(pPage);
        nFrag += iStart - iPtrEnd;
        iSize = iEnd - iPtr;
        iStart = iPtr;
      }
    }
    /* The fragment bytes absorbed above may not exceed the page's
    ** recorded fragment count. */
    if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage);
    data[hdr+7] -= nFrag;
  }
  pTmp = &data[hdr+5];
  x = get2byte(pTmp);
  if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){
    /* Overwrite deleted information with zeros when the secure_delete
    ** option is enabled */
    memset(&data[iStart], 0, iSize);
  }
  if( iStart<=x ){
    /* The new freeblock is at the beginning of the cell content area,
    ** so just extend the cell content area rather than create another
    ** freelist entry */
    if( iStart<x ) return SQLITE_CORRUPT_PAGE(pPage);
    if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_PAGE(pPage);
    put2byte(&data[hdr+1], iFreeBlk);
    put2byte(&data[hdr+5], iEnd);
  }else{
    /* Insert the new freeblock into the freelist */
    put2byte(&data[iPtr], iStart);
    put2byte(&data[iStart], iFreeBlk);
    put2byte(&data[iStart+2], iSize);
  }
  /* Only the bytes passed in by the caller are added to nFree; bytes
  ** gained through coalescing are not. */
  pPage->nFree += iOrigSize;
  return SQLITE_OK;
}

/*
** Decode the flags byte (the first byte of the header) for a page
** and initialize fields of the MemPage structure accordingly.
**
** Only the following combinations are supported.  Anything different
** indicates a corrupt database file:
**
**         PTF_ZERODATA                             (0x02,  2)
**         PTF_LEAFDATA | PTF_INTKEY                (0x05,  5)
**         PTF_ZERODATA | PTF_LEAF                  (0x0a, 10)
**         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF     (0x0d, 13)
**
** Returns SQLITE_OK for a supported combination.  For anything else the
** result of SQLITE_CORRUPT_PAGE() is returned; in that case xCellSize and
** xParseCell are still assigned, but maxLocal and minLocal are not.
*/
static int decodeFlags(MemPage *pPage, int flagByte){
  BtShared *pBt;     /* A copy of pPage->pBt */

  assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  pBt = pPage->pBt;
  pPage->max1bytePayload = pBt->max1bytePayload;
  if( flagByte>=(PTF_ZERODATA | PTF_LEAF) ){
    /* Flag bytes at or above PTF_ZERODATA|PTF_LEAF are handled as leaf
    ** pages, which carry no child pointer. */
    pPage->childPtrSize = 0;
    pPage->leaf = 1;
    if( flagByte==(PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF) ){
      /* Leaf page with integer keys */
      pPage->intKeyLeaf = 1;
      pPage->xCellSize = cellSizePtrTableLeaf;
      pPage->xParseCell = btreeParseCellPtr;
      pPage->intKey = 1;
      pPage->maxLocal = pBt->maxLeaf;
      pPage->minLocal = pBt->minLeaf;
    }else if( flagByte==(PTF_ZERODATA | PTF_LEAF) ){
      /* Leaf page without integer keys */
      pPage->intKey = 0;
      pPage->intKeyLeaf = 0;
      pPage->xCellSize = cellSizePtrIdxLeaf;
      pPage->xParseCell = btreeParseCellPtrIndex;
      pPage->maxLocal = pBt->maxLocal;
      pPage->minLocal = pBt->minLocal;
    }else{
      /* Unsupported flag byte.  The method pointers are still set before
      ** the corruption error is returned. */
      pPage->intKey = 0;
      pPage->intKeyLeaf = 0;
      pPage->xCellSize = cellSizePtrIdxLeaf;
      pPage->xParseCell = btreeParseCellPtrIndex;
      return SQLITE_CORRUPT_PAGE(pPage);
    }
  }else{
    /* Everything else is handled as an interior page with a 4-byte
    ** child pointer. */
    pPage->childPtrSize = 4;
    pPage->leaf = 0;
    if( flagByte==(PTF_ZERODATA) ){
      /* Interior page without integer keys */
      pPage->intKey = 0;
      pPage->intKeyLeaf = 0;
      pPage->xCellSize = cellSizePtr;
      pPage->xParseCell = btreeParseCellPtrIndex;
      pPage->maxLocal = pBt->maxLocal;
      pPage->minLocal = pBt->minLocal;
    }else if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
      /* Interior page with integer keys */
      pPage->intKeyLeaf = 0;
      pPage->xCellSize = cellSizePtrNoPayload;
      pPage->xParseCell = btreeParseCellPtrNoPayload;
      pPage->intKey = 1;
      pPage->maxLocal = pBt->maxLeaf;
      pPage->minLocal = pBt->minLeaf;
    }else{
      /* Unsupported flag byte.  The method pointers are still set before
      ** the corruption error is returned. */
      pPage->intKey = 0;
      pPage->intKeyLeaf = 0;
      pPage->xCellSize = cellSizePtr;
      pPage->xParseCell = btreeParseCellPtrIndex;
      return SQLITE_CORRUPT_PAGE(pPage);
    }
  }
  return SQLITE_OK;
}

/*
** Compute the amount of freespace on
** the page.  In other words, fill in the pPage->nFree field.
*/
static int btreeComputeFreeSpace(MemPage *pPage){
  int pc;            /* Address of a freeblock within pPage->aData[] */
  u8 hdr;            /* Offset to beginning of page header */
  u8 *data;          /* Equal to pPage->aData */
  int usableSize;    /* Amount of usable space on each page */
  int nFree;         /* Number of unused bytes on the page */
  int top;           /* First byte of the cell content area */
  int iCellFirst;    /* First allowable cell or freeblock offset */
  int iCellLast;     /* Last possible cell or freeblock offset */

  /* The page must be initialized, with nFree still negative, which is how
  ** btreeInitPage() marks the value as not yet computed. */
  assert( pPage->pBt!=0 );
  assert( pPage->pBt->db!=0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
  assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
  assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
  assert( pPage->isInit==1 );
  assert( pPage->nFree<0 );

  usableSize = pPage->pBt->usableSize;
  hdr = pPage->hdrOffset;
  data = pPage->aData;
  /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
  ** the start of the cell content area. A zero value for this integer is
  ** interpreted as 65536. */
  top = get2byteNotZero(&data[hdr+5]);
  iCellFirst = hdr + 8 + pPage->childPtrSize + 2*pPage->nCell;
  iCellLast = usableSize - 4;

  /* Compute the total free space on the page
  ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
  ** start of the first freeblock on the page, or is zero if there are no
  ** freeblocks. */
  pc = get2byte(&data[hdr+1]);
  nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
  if( pc>0 ){
    u32 next, size;    /* Next-pointer and size of the current freeblock */
    if( pc<top ){
      /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
      ** always be at least one cell before the first freeblock.
      */
      return SQLITE_CORRUPT_PAGE(pPage);
    }
    /* Walk the freeblock chain, adding up the block sizes.  The loop ends
    ** when the next-pointer does not lie more than 3 bytes past the end of
    ** the current block.  That covers both the normal terminator (next==0)
    ** and an out-of-order chain; the two are told apart after the loop. */
    while( 1 ){
      if( pc>iCellLast ){
        /* Freeblock off the end of the page */
        return SQLITE_CORRUPT_PAGE(pPage);
      }
      next = get2byte(&data[pc]);
      size = get2byte(&data[pc+2]);
      nFree = nFree + size;
      if( next<=pc+size+3 ) break;
      pc = next;
    }
    if( next>0 ){
      /* Freeblock not in ascending order */
      return SQLITE_CORRUPT_PAGE(pPage);
    }
    if( pc+size>(unsigned int)usableSize ){
      /* Last freeblock extends past page end */
      return SQLITE_CORRUPT_PAGE(pPage);
    }
  }

  /* At this point, nFree contains the sum of the offset to the start
  ** of the cell-content area plus the number of free bytes within
  ** the cell-content area. If this is greater than the usable-size
  ** of the page, then the page must be corrupted. This check also
  ** serves to verify that the offset to the start of the cell-content
  ** area, according to the page header, lies within the page.
  */
  if( nFree>usableSize || nFree<iCellFirst ){
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  /* Subtract the header and cell pointer array to get the free bytes */
  pPage->nFree = (u16)(nFree - iCellFirst);
  return SQLITE_OK;
}

/*
** Do additional sanity check after btreeInitPage() if
** PRAGMA cell_size_check=ON
**
** Every cell pointer must fall between the end of the cell pointer array
** and iCellLast, and the cell it refers to must end within the usable
** area.  Returns SQLITE_OK if all cells pass, otherwise the result of
** SQLITE_CORRUPT_PAGE().
*/
static SQLITE_NOINLINE int btreeCellSizeCheck(MemPage *pPage){
  int iCellFirst;    /* First allowable cell or freeblock offset */
  int iCellLast;     /* Last possible cell or freeblock offset */
  int i;             /* Index into the cell pointer array */
  int sz;            /* Size of a cell */
  int pc;            /* Address of the i-th cell within pPage->aData[] */
  u8 *data;          /* Equal to pPage->aData */
  int usableSize;    /* Maximum usable space on the page */
  int cellOffset;    /* Offset of the cell pointer array */

  iCellFirst = pPage->cellOffset + 2*pPage->nCell;
  usableSize = pPage->pBt->usableSize;
  iCellLast = usableSize - 4;
  data = pPage->aData;
  cellOffset = pPage->cellOffset;
  /* On non-leaf pages the last permitted cell offset is one byte lower */
  if( !pPage->leaf ) iCellLast--;
  for(i=0; i<pPage->nCell; i++){
    pc = get2byteAligned(&data[cellOffset+i*2]);
    testcase( pc==iCellFirst );
    testcase( pc==iCellLast );
    if( pc<iCellFirst || pc>iCellLast ){
      return SQLITE_CORRUPT_PAGE(pPage);
    }
    sz = pPage->xCellSize(pPage, &data[pc]);
    testcase( pc+sz==usableSize );
    if( pc+sz>usableSize ){
      return SQLITE_CORRUPT_PAGE(pPage);
    }
  }
  return SQLITE_OK;
}

/*
** Initialize the auxiliary information for a disk block.
**
** Return SQLITE_OK on success.  If we see that the page does
** not contain a well-formed database page, then return
** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
** guarantee that the page is well-formed.  It only shows that
** we failed to detect any corruption.
002207 */ 002208 static int btreeInitPage(MemPage *pPage){ 002209 u8 *data; /* Equal to pPage->aData */ 002210 BtShared *pBt; /* The main btree structure */ 002211 002212 assert( pPage->pBt!=0 ); 002213 assert( pPage->pBt->db!=0 ); 002214 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002215 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 002216 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 002217 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 002218 assert( pPage->isInit==0 ); 002219 002220 pBt = pPage->pBt; 002221 data = pPage->aData + pPage->hdrOffset; 002222 /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating 002223 ** the b-tree page type. */ 002224 if( decodeFlags(pPage, data[0]) ){ 002225 return SQLITE_CORRUPT_PAGE(pPage); 002226 } 002227 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 002228 pPage->maskPage = (u16)(pBt->pageSize - 1); 002229 pPage->nOverflow = 0; 002230 pPage->cellOffset = pPage->hdrOffset + 8 + pPage->childPtrSize; 002231 pPage->aCellIdx = data + pPage->childPtrSize + 8; 002232 pPage->aDataEnd = pPage->aData + pBt->pageSize; 002233 pPage->aDataOfst = pPage->aData + pPage->childPtrSize; 002234 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the 002235 ** number of cells on the page. */ 002236 pPage->nCell = get2byte(&data[3]); 002237 if( pPage->nCell>MX_CELL(pBt) ){ 002238 /* To many cells for a single page. The page must be corrupt */ 002239 return SQLITE_CORRUPT_PAGE(pPage); 002240 } 002241 testcase( pPage->nCell==MX_CELL(pBt) ); 002242 /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only 002243 ** possible for a root page of a table that contains no rows) then the 002244 ** offset to the cell content area will equal the page size minus the 002245 ** bytes of reserved space. 
*/ 002246 assert( pPage->nCell>0 002247 || get2byteNotZero(&data[5])==(int)pBt->usableSize 002248 || CORRUPT_DB ); 002249 pPage->nFree = -1; /* Indicate that this value is yet uncomputed */ 002250 pPage->isInit = 1; 002251 if( pBt->db->flags & SQLITE_CellSizeCk ){ 002252 return btreeCellSizeCheck(pPage); 002253 } 002254 return SQLITE_OK; 002255 } 002256 002257 /* 002258 ** Set up a raw page so that it looks like a database page holding 002259 ** no entries. 002260 */ 002261 static void zeroPage(MemPage *pPage, int flags){ 002262 unsigned char *data = pPage->aData; 002263 BtShared *pBt = pPage->pBt; 002264 u8 hdr = pPage->hdrOffset; 002265 u16 first; 002266 002267 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno || CORRUPT_DB ); 002268 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002269 assert( sqlite3PagerGetData(pPage->pDbPage) == data ); 002270 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 002271 assert( sqlite3_mutex_held(pBt->mutex) ); 002272 if( pBt->btsFlags & BTS_FAST_SECURE ){ 002273 memset(&data[hdr], 0, pBt->usableSize - hdr); 002274 } 002275 data[hdr] = (char)flags; 002276 first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8); 002277 memset(&data[hdr+1], 0, 4); 002278 data[hdr+7] = 0; 002279 put2byte(&data[hdr+5], pBt->usableSize); 002280 pPage->nFree = (u16)(pBt->usableSize - first); 002281 decodeFlags(pPage, flags); 002282 pPage->cellOffset = first; 002283 pPage->aDataEnd = &data[pBt->pageSize]; 002284 pPage->aCellIdx = &data[first]; 002285 pPage->aDataOfst = &data[pPage->childPtrSize]; 002286 pPage->nOverflow = 0; 002287 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 002288 pPage->maskPage = (u16)(pBt->pageSize - 1); 002289 pPage->nCell = 0; 002290 pPage->isInit = 1; 002291 } 002292 002293 002294 /* 002295 ** Convert a DbPage obtained from the pager into a MemPage used by 002296 ** the btree layer. 
002297 */ 002298 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ 002299 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); 002300 if( pgno!=pPage->pgno ){ 002301 pPage->aData = sqlite3PagerGetData(pDbPage); 002302 pPage->pDbPage = pDbPage; 002303 pPage->pBt = pBt; 002304 pPage->pgno = pgno; 002305 pPage->hdrOffset = pgno==1 ? 100 : 0; 002306 } 002307 assert( pPage->aData==sqlite3PagerGetData(pDbPage) ); 002308 return pPage; 002309 } 002310 002311 /* 002312 ** Get a page from the pager. Initialize the MemPage.pBt and 002313 ** MemPage.aData elements if needed. See also: btreeGetUnusedPage(). 002314 ** 002315 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care 002316 ** about the content of the page at this time. So do not go to the disk 002317 ** to fetch the content. Just fill in the content with zeros for now. 002318 ** If in the future we call sqlite3PagerWrite() on this page, that 002319 ** means we have started to be concerned about content and the disk 002320 ** read should occur at that point. 002321 */ 002322 static int btreeGetPage( 002323 BtShared *pBt, /* The btree */ 002324 Pgno pgno, /* Number of the page to fetch */ 002325 MemPage **ppPage, /* Return the page in this parameter */ 002326 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */ 002327 ){ 002328 int rc; 002329 DbPage *pDbPage; 002330 002331 assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY ); 002332 assert( sqlite3_mutex_held(pBt->mutex) ); 002333 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags); 002334 if( rc ) return rc; 002335 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt); 002336 return SQLITE_OK; 002337 } 002338 002339 /* 002340 ** Retrieve a page from the pager cache. If the requested page is not 002341 ** already in the pager cache return NULL. Initialize the MemPage.pBt and 002342 ** MemPage.aData elements if needed. 
002343 */ 002344 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){ 002345 DbPage *pDbPage; 002346 assert( sqlite3_mutex_held(pBt->mutex) ); 002347 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); 002348 if( pDbPage ){ 002349 return btreePageFromDbPage(pDbPage, pgno, pBt); 002350 } 002351 return 0; 002352 } 002353 002354 /* 002355 ** Return the size of the database file in pages. If there is any kind of 002356 ** error, return ((unsigned int)-1). 002357 */ 002358 static Pgno btreePagecount(BtShared *pBt){ 002359 return pBt->nPage; 002360 } 002361 Pgno sqlite3BtreeLastPage(Btree *p){ 002362 assert( sqlite3BtreeHoldsMutex(p) ); 002363 return btreePagecount(p->pBt); 002364 } 002365 002366 /* 002367 ** Get a page from the pager and initialize it. 002368 */ 002369 static int getAndInitPage( 002370 BtShared *pBt, /* The database file */ 002371 Pgno pgno, /* Number of the page to get */ 002372 MemPage **ppPage, /* Write the page pointer here */ 002373 int bReadOnly /* True for a read-only page */ 002374 ){ 002375 int rc; 002376 DbPage *pDbPage; 002377 MemPage *pPage; 002378 assert( sqlite3_mutex_held(pBt->mutex) ); 002379 002380 if( pgno>btreePagecount(pBt) ){ 002381 *ppPage = 0; 002382 return SQLITE_CORRUPT_BKPT; 002383 } 002384 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly); 002385 if( rc ){ 002386 *ppPage = 0; 002387 return rc; 002388 } 002389 pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); 002390 if( pPage->isInit==0 ){ 002391 btreePageFromDbPage(pDbPage, pgno, pBt); 002392 rc = btreeInitPage(pPage); 002393 if( rc!=SQLITE_OK ){ 002394 releasePage(pPage); 002395 *ppPage = 0; 002396 return rc; 002397 } 002398 } 002399 assert( pPage->pgno==pgno || CORRUPT_DB ); 002400 assert( pPage->aData==sqlite3PagerGetData(pDbPage) ); 002401 *ppPage = pPage; 002402 return SQLITE_OK; 002403 } 002404 002405 /* 002406 ** Release a MemPage. This should be called once for each prior 002407 ** call to btreeGetPage. 
002408 ** 002409 ** Page1 is a special case and must be released using releasePageOne(). 002410 */ 002411 static void releasePageNotNull(MemPage *pPage){ 002412 assert( pPage->aData ); 002413 assert( pPage->pBt ); 002414 assert( pPage->pDbPage!=0 ); 002415 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002416 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 002417 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002418 sqlite3PagerUnrefNotNull(pPage->pDbPage); 002419 } 002420 static void releasePage(MemPage *pPage){ 002421 if( pPage ) releasePageNotNull(pPage); 002422 } 002423 static void releasePageOne(MemPage *pPage){ 002424 assert( pPage!=0 ); 002425 assert( pPage->aData ); 002426 assert( pPage->pBt ); 002427 assert( pPage->pDbPage!=0 ); 002428 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002429 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 002430 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002431 sqlite3PagerUnrefPageOne(pPage->pDbPage); 002432 } 002433 002434 /* 002435 ** Get an unused page. 002436 ** 002437 ** This works just like btreeGetPage() with the addition: 002438 ** 002439 ** * If the page is already in use for some other purpose, immediately 002440 ** release it and return an SQLITE_CURRUPT error. 
002441 ** * Make sure the isInit flag is clear 002442 */ 002443 static int btreeGetUnusedPage( 002444 BtShared *pBt, /* The btree */ 002445 Pgno pgno, /* Number of the page to fetch */ 002446 MemPage **ppPage, /* Return the page in this parameter */ 002447 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */ 002448 ){ 002449 int rc = btreeGetPage(pBt, pgno, ppPage, flags); 002450 if( rc==SQLITE_OK ){ 002451 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ 002452 releasePage(*ppPage); 002453 *ppPage = 0; 002454 return SQLITE_CORRUPT_BKPT; 002455 } 002456 (*ppPage)->isInit = 0; 002457 }else{ 002458 *ppPage = 0; 002459 } 002460 return rc; 002461 } 002462 002463 002464 /* 002465 ** During a rollback, when the pager reloads information into the cache 002466 ** so that the cache is restored to its original state at the start of 002467 ** the transaction, for each page restored this routine is called. 002468 ** 002469 ** This routine needs to reset the extra data section at the end of the 002470 ** page to agree with the restored data. 002471 */ 002472 static void pageReinit(DbPage *pData){ 002473 MemPage *pPage; 002474 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 002475 assert( sqlite3PagerPageRefcount(pData)>0 ); 002476 if( pPage->isInit ){ 002477 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002478 pPage->isInit = 0; 002479 if( sqlite3PagerPageRefcount(pData)>1 ){ 002480 /* pPage might not be a btree page; it might be an overflow page 002481 ** or ptrmap page or a free page. In those cases, the following 002482 ** call to btreeInitPage() will likely return SQLITE_CORRUPT. 002483 ** But no harm is done by this. And it is very important that 002484 ** btreeInitPage() be called on every btree page so we make 002485 ** the call for every page that comes in for re-initializing. */ 002486 btreeInitPage(pPage); 002487 } 002488 } 002489 } 002490 002491 /* 002492 ** Invoke the busy handler for a btree. 
002493 */ 002494 static int btreeInvokeBusyHandler(void *pArg){ 002495 BtShared *pBt = (BtShared*)pArg; 002496 assert( pBt->db ); 002497 assert( sqlite3_mutex_held(pBt->db->mutex) ); 002498 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler); 002499 } 002500 002501 /* 002502 ** Open a database file. 002503 ** 002504 ** zFilename is the name of the database file. If zFilename is NULL 002505 ** then an ephemeral database is created. The ephemeral database might 002506 ** be exclusively in memory, or it might use a disk-based memory cache. 002507 ** Either way, the ephemeral database will be automatically deleted 002508 ** when sqlite3BtreeClose() is called. 002509 ** 002510 ** If zFilename is ":memory:" then an in-memory database is created 002511 ** that is automatically destroyed when it is closed. 002512 ** 002513 ** The "flags" parameter is a bitmask that might contain bits like 002514 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY. 002515 ** 002516 ** If the database is already opened in the same database connection 002517 ** and we are in shared cache mode, then the open will fail with an 002518 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared 002519 ** objects in the same database connection since doing so will lead 002520 ** to problems with locking. 002521 */ 002522 int sqlite3BtreeOpen( 002523 sqlite3_vfs *pVfs, /* VFS to use for this b-tree */ 002524 const char *zFilename, /* Name of the file containing the BTree database */ 002525 sqlite3 *db, /* Associated database handle */ 002526 Btree **ppBtree, /* Pointer to new Btree object written here */ 002527 int flags, /* Options */ 002528 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */ 002529 ){ 002530 BtShared *pBt = 0; /* Shared part of btree structure */ 002531 Btree *p; /* Handle to return */ 002532 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. 
Ticket #3537 */ 002533 int rc = SQLITE_OK; /* Result code from this function */ 002534 u8 nReserve; /* Byte of unused space on each page */ 002535 unsigned char zDbHeader[100]; /* Database header content */ 002536 002537 /* True if opening an ephemeral, temporary database */ 002538 const int isTempDb = zFilename==0 || zFilename[0]==0; 002539 002540 /* Set the variable isMemdb to true for an in-memory database, or 002541 ** false for a file-based database. 002542 */ 002543 #ifdef SQLITE_OMIT_MEMORYDB 002544 const int isMemdb = 0; 002545 #else 002546 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0) 002547 || (isTempDb && sqlite3TempInMemory(db)) 002548 || (vfsFlags & SQLITE_OPEN_MEMORY)!=0; 002549 #endif 002550 002551 assert( db!=0 ); 002552 assert( pVfs!=0 ); 002553 assert( sqlite3_mutex_held(db->mutex) ); 002554 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */ 002555 002556 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */ 002557 assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 ); 002558 002559 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */ 002560 assert( (flags & BTREE_SINGLE)==0 || isTempDb ); 002561 002562 if( isMemdb ){ 002563 flags |= BTREE_MEMORY; 002564 } 002565 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){ 002566 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB; 002567 } 002568 p = sqlite3MallocZero(sizeof(Btree)); 002569 if( !p ){ 002570 return SQLITE_NOMEM_BKPT; 002571 } 002572 p->inTrans = TRANS_NONE; 002573 p->db = db; 002574 #ifndef SQLITE_OMIT_SHARED_CACHE 002575 p->lock.pBtree = p; 002576 p->lock.iTable = 1; 002577 #endif 002578 002579 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002580 /* 002581 ** If this Btree is a candidate for shared cache, try to find an 002582 ** existing BtShared object that we can share with 002583 */ 002584 if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){ 002585 if( 
vfsFlags & SQLITE_OPEN_SHAREDCACHE ){ 002586 int nFilename = sqlite3Strlen30(zFilename)+1; 002587 int nFullPathname = pVfs->mxPathname+1; 002588 char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename)); 002589 MUTEX_LOGIC( sqlite3_mutex *mutexShared; ) 002590 002591 p->sharable = 1; 002592 if( !zFullPathname ){ 002593 sqlite3_free(p); 002594 return SQLITE_NOMEM_BKPT; 002595 } 002596 if( isMemdb ){ 002597 memcpy(zFullPathname, zFilename, nFilename); 002598 }else{ 002599 rc = sqlite3OsFullPathname(pVfs, zFilename, 002600 nFullPathname, zFullPathname); 002601 if( rc ){ 002602 if( rc==SQLITE_OK_SYMLINK ){ 002603 rc = SQLITE_OK; 002604 }else{ 002605 sqlite3_free(zFullPathname); 002606 sqlite3_free(p); 002607 return rc; 002608 } 002609 } 002610 } 002611 #if SQLITE_THREADSAFE 002612 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN); 002613 sqlite3_mutex_enter(mutexOpen); 002614 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN); 002615 sqlite3_mutex_enter(mutexShared); 002616 #endif 002617 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){ 002618 assert( pBt->nRef>0 ); 002619 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0)) 002620 && sqlite3PagerVfs(pBt->pPager)==pVfs ){ 002621 int iDb; 002622 for(iDb=db->nDb-1; iDb>=0; iDb--){ 002623 Btree *pExisting = db->aDb[iDb].pBt; 002624 if( pExisting && pExisting->pBt==pBt ){ 002625 sqlite3_mutex_leave(mutexShared); 002626 sqlite3_mutex_leave(mutexOpen); 002627 sqlite3_free(zFullPathname); 002628 sqlite3_free(p); 002629 return SQLITE_CONSTRAINT; 002630 } 002631 } 002632 p->pBt = pBt; 002633 pBt->nRef++; 002634 break; 002635 } 002636 } 002637 sqlite3_mutex_leave(mutexShared); 002638 sqlite3_free(zFullPathname); 002639 } 002640 #ifdef SQLITE_DEBUG 002641 else{ 002642 /* In debug mode, we mark all persistent databases as sharable 002643 ** even when they are not. 
This exercises the locking code and 002644 ** gives more opportunity for asserts(sqlite3_mutex_held()) 002645 ** statements to find locking problems. 002646 */ 002647 p->sharable = 1; 002648 } 002649 #endif 002650 } 002651 #endif 002652 if( pBt==0 ){ 002653 /* 002654 ** The following asserts make sure that structures used by the btree are 002655 ** the right size. This is to guard against size changes that result 002656 ** when compiling on a different architecture. 002657 */ 002658 assert( sizeof(i64)==8 ); 002659 assert( sizeof(u64)==8 ); 002660 assert( sizeof(u32)==4 ); 002661 assert( sizeof(u16)==2 ); 002662 assert( sizeof(Pgno)==4 ); 002663 002664 /* Suppress false-positive compiler warning from PVS-Studio */ 002665 memset(&zDbHeader[16], 0, 8); 002666 002667 pBt = sqlite3MallocZero( sizeof(*pBt) ); 002668 if( pBt==0 ){ 002669 rc = SQLITE_NOMEM_BKPT; 002670 goto btree_open_out; 002671 } 002672 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename, 002673 sizeof(MemPage), flags, vfsFlags, pageReinit); 002674 if( rc==SQLITE_OK ){ 002675 sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap); 002676 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader); 002677 } 002678 if( rc!=SQLITE_OK ){ 002679 goto btree_open_out; 002680 } 002681 pBt->openFlags = (u8)flags; 002682 pBt->db = db; 002683 sqlite3PagerSetBusyHandler(pBt->pPager, btreeInvokeBusyHandler, pBt); 002684 p->pBt = pBt; 002685 002686 pBt->pCursor = 0; 002687 pBt->pPage1 = 0; 002688 if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY; 002689 #if defined(SQLITE_SECURE_DELETE) 002690 pBt->btsFlags |= BTS_SECURE_DELETE; 002691 #elif defined(SQLITE_FAST_SECURE_DELETE) 002692 pBt->btsFlags |= BTS_OVERWRITE; 002693 #endif 002694 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is 002695 ** determined by the 2-byte integer located at an offset of 16 bytes from 002696 ** the beginning of the database file. 
*/ 002697 pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16); 002698 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE 002699 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ 002700 pBt->pageSize = 0; 002701 #ifndef SQLITE_OMIT_AUTOVACUUM 002702 /* If the magic name ":memory:" will create an in-memory database, then 002703 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if 002704 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if 002705 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a 002706 ** regular file-name. In this case the auto-vacuum applies as per normal. 002707 */ 002708 if( zFilename && !isMemdb ){ 002709 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0); 002710 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0); 002711 } 002712 #endif 002713 nReserve = 0; 002714 }else{ 002715 /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is 002716 ** determined by the one-byte unsigned integer found at an offset of 20 002717 ** into the database file header. */ 002718 nReserve = zDbHeader[20]; 002719 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 002720 #ifndef SQLITE_OMIT_AUTOVACUUM 002721 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0); 002722 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0); 002723 #endif 002724 } 002725 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 002726 if( rc ) goto btree_open_out; 002727 pBt->usableSize = pBt->pageSize - nReserve; 002728 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ 002729 002730 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002731 /* Add the new BtShared object to the linked list sharable BtShareds. 
002732 */ 002733 pBt->nRef = 1; 002734 if( p->sharable ){ 002735 MUTEX_LOGIC( sqlite3_mutex *mutexShared; ) 002736 MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN);) 002737 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){ 002738 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST); 002739 if( pBt->mutex==0 ){ 002740 rc = SQLITE_NOMEM_BKPT; 002741 goto btree_open_out; 002742 } 002743 } 002744 sqlite3_mutex_enter(mutexShared); 002745 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList); 002746 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt; 002747 sqlite3_mutex_leave(mutexShared); 002748 } 002749 #endif 002750 } 002751 002752 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002753 /* If the new Btree uses a sharable pBtShared, then link the new 002754 ** Btree into the list of all sharable Btrees for the same connection. 002755 ** The list is kept in ascending order by pBt address. 002756 */ 002757 if( p->sharable ){ 002758 int i; 002759 Btree *pSib; 002760 for(i=0; i<db->nDb; i++){ 002761 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){ 002762 while( pSib->pPrev ){ pSib = pSib->pPrev; } 002763 if( (uptr)p->pBt<(uptr)pSib->pBt ){ 002764 p->pNext = pSib; 002765 p->pPrev = 0; 002766 pSib->pPrev = p; 002767 }else{ 002768 while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){ 002769 pSib = pSib->pNext; 002770 } 002771 p->pNext = pSib->pNext; 002772 p->pPrev = pSib; 002773 if( p->pNext ){ 002774 p->pNext->pPrev = p; 002775 } 002776 pSib->pNext = p; 002777 } 002778 break; 002779 } 002780 } 002781 } 002782 #endif 002783 *ppBtree = p; 002784 002785 btree_open_out: 002786 if( rc!=SQLITE_OK ){ 002787 if( pBt && pBt->pPager ){ 002788 sqlite3PagerClose(pBt->pPager, 0); 002789 } 002790 sqlite3_free(pBt); 002791 sqlite3_free(p); 002792 *ppBtree = 0; 002793 }else{ 002794 sqlite3_file *pFile; 002795 002796 /* If the B-Tree was successfully opened, set the pager-cache size to the 002797 ** default value. 
Except, when opening on an existing shared pager-cache, 002798 ** do not change the pager-cache size. 002799 */ 002800 if( sqlite3BtreeSchema(p, 0, 0)==0 ){ 002801 sqlite3BtreeSetCacheSize(p, SQLITE_DEFAULT_CACHE_SIZE); 002802 } 002803 002804 pFile = sqlite3PagerFile(pBt->pPager); 002805 if( pFile->pMethods ){ 002806 sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db); 002807 } 002808 } 002809 if( mutexOpen ){ 002810 assert( sqlite3_mutex_held(mutexOpen) ); 002811 sqlite3_mutex_leave(mutexOpen); 002812 } 002813 assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 ); 002814 return rc; 002815 } 002816 002817 /* 002818 ** Decrement the BtShared.nRef counter. When it reaches zero, 002819 ** remove the BtShared structure from the sharing list. Return 002820 ** true if the BtShared.nRef counter reaches zero and return 002821 ** false if it is still positive. 002822 */ 002823 static int removeFromSharingList(BtShared *pBt){ 002824 #ifndef SQLITE_OMIT_SHARED_CACHE 002825 MUTEX_LOGIC( sqlite3_mutex *pMainMtx; ) 002826 BtShared *pList; 002827 int removed = 0; 002828 002829 assert( sqlite3_mutex_notheld(pBt->mutex) ); 002830 MUTEX_LOGIC( pMainMtx = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN); ) 002831 sqlite3_mutex_enter(pMainMtx); 002832 pBt->nRef--; 002833 if( pBt->nRef<=0 ){ 002834 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){ 002835 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext; 002836 }else{ 002837 pList = GLOBAL(BtShared*,sqlite3SharedCacheList); 002838 while( ALWAYS(pList) && pList->pNext!=pBt ){ 002839 pList=pList->pNext; 002840 } 002841 if( ALWAYS(pList) ){ 002842 pList->pNext = pBt->pNext; 002843 } 002844 } 002845 if( SQLITE_THREADSAFE ){ 002846 sqlite3_mutex_free(pBt->mutex); 002847 } 002848 removed = 1; 002849 } 002850 sqlite3_mutex_leave(pMainMtx); 002851 return removed; 002852 #else 002853 return 1; 002854 #endif 002855 } 002856 002857 /* 002858 ** Make sure pBt->pTmpSpace points to an allocation of 002859 ** 
MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child 002860 ** pointer. 002861 */ 002862 static SQLITE_NOINLINE int allocateTempSpace(BtShared *pBt){ 002863 assert( pBt!=0 ); 002864 assert( pBt->pTmpSpace==0 ); 002865 /* This routine is called only by btreeCursor() when allocating the 002866 ** first write cursor for the BtShared object */ 002867 assert( pBt->pCursor!=0 && (pBt->pCursor->curFlags & BTCF_WriteFlag)!=0 ); 002868 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); 002869 if( pBt->pTmpSpace==0 ){ 002870 BtCursor *pCur = pBt->pCursor; 002871 pBt->pCursor = pCur->pNext; /* Unlink the cursor */ 002872 memset(pCur, 0, sizeof(*pCur)); 002873 return SQLITE_NOMEM_BKPT; 002874 } 002875 002876 /* One of the uses of pBt->pTmpSpace is to format cells before 002877 ** inserting them into a leaf page (function fillInCell()). If 002878 ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes 002879 ** by the various routines that manipulate binary cells. Which 002880 ** can mean that fillInCell() only initializes the first 2 or 3 002881 ** bytes of pTmpSpace, but that the first 4 bytes are copied from 002882 ** it into a database page. This is not actually a problem, but it 002883 ** does cause a valgrind error when the 1 or 2 bytes of uninitialized 002884 ** data is passed to system call write(). So to avoid this error, 002885 ** zero the first 4 bytes of temp space here. 002886 ** 002887 ** Also: Provide four bytes of initialized space before the 002888 ** beginning of pTmpSpace as an area available to prepend the 002889 ** left-child pointer to the beginning of a cell. 
002890 */ 002891 memset(pBt->pTmpSpace, 0, 8); 002892 pBt->pTmpSpace += 4; 002893 return SQLITE_OK; 002894 } 002895 002896 /* 002897 ** Free the pBt->pTmpSpace allocation 002898 */ 002899 static void freeTempSpace(BtShared *pBt){ 002900 if( pBt->pTmpSpace ){ 002901 pBt->pTmpSpace -= 4; 002902 sqlite3PageFree(pBt->pTmpSpace); 002903 pBt->pTmpSpace = 0; 002904 } 002905 } 002906 002907 /* 002908 ** Close an open database and invalidate all cursors. 002909 */ 002910 int sqlite3BtreeClose(Btree *p){ 002911 BtShared *pBt = p->pBt; 002912 002913 /* Close all cursors opened via this handle. */ 002914 assert( sqlite3_mutex_held(p->db->mutex) ); 002915 sqlite3BtreeEnter(p); 002916 002917 /* Verify that no other cursors have this Btree open */ 002918 #ifdef SQLITE_DEBUG 002919 { 002920 BtCursor *pCur = pBt->pCursor; 002921 while( pCur ){ 002922 BtCursor *pTmp = pCur; 002923 pCur = pCur->pNext; 002924 assert( pTmp->pBtree!=p ); 002925 002926 } 002927 } 002928 #endif 002929 002930 /* Rollback any active transaction and free the handle structure. 002931 ** The call to sqlite3BtreeRollback() drops any table-locks held by 002932 ** this handle. 002933 */ 002934 sqlite3BtreeRollback(p, SQLITE_OK, 0); 002935 sqlite3BtreeLeave(p); 002936 002937 /* If there are still other outstanding references to the shared-btree 002938 ** structure, return now. The remainder of this procedure cleans 002939 ** up the shared-btree. 002940 */ 002941 assert( p->wantToLock==0 && p->locked==0 ); 002942 if( !p->sharable || removeFromSharingList(pBt) ){ 002943 /* The pBt is no longer on the sharing list, so we can access 002944 ** it without having to hold the mutex. 002945 ** 002946 ** Clean out and delete the BtShared object. 
002947 */ 002948 assert( !pBt->pCursor ); 002949 sqlite3PagerClose(pBt->pPager, p->db); 002950 if( pBt->xFreeSchema && pBt->pSchema ){ 002951 pBt->xFreeSchema(pBt->pSchema); 002952 } 002953 sqlite3DbFree(0, pBt->pSchema); 002954 freeTempSpace(pBt); 002955 sqlite3_free(pBt); 002956 } 002957 002958 #ifndef SQLITE_OMIT_SHARED_CACHE 002959 assert( p->wantToLock==0 ); 002960 assert( p->locked==0 ); 002961 if( p->pPrev ) p->pPrev->pNext = p->pNext; 002962 if( p->pNext ) p->pNext->pPrev = p->pPrev; 002963 #endif 002964 002965 sqlite3_free(p); 002966 return SQLITE_OK; 002967 } 002968 002969 /* 002970 ** Change the "soft" limit on the number of pages in the cache. 002971 ** Unused and unmodified pages will be recycled when the number of 002972 ** pages in the cache exceeds this soft limit. But the size of the 002973 ** cache is allowed to grow larger than this limit if it contains 002974 ** dirty pages or pages still in active use. 002975 */ 002976 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ 002977 BtShared *pBt = p->pBt; 002978 assert( sqlite3_mutex_held(p->db->mutex) ); 002979 sqlite3BtreeEnter(p); 002980 sqlite3PagerSetCachesize(pBt->pPager, mxPage); 002981 sqlite3BtreeLeave(p); 002982 return SQLITE_OK; 002983 } 002984 002985 /* 002986 ** Change the "spill" limit on the number of pages in the cache. 002987 ** If the number of pages exceeds this limit during a write transaction, 002988 ** the pager might attempt to "spill" pages to the journal early in 002989 ** order to free up memory. 002990 ** 002991 ** The value returned is the current spill size. If zero is passed 002992 ** as an argument, no changes are made to the spill size setting, so 002993 ** using mxPage of 0 is a way to query the current spill size. 
002994 */ 002995 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){ 002996 BtShared *pBt = p->pBt; 002997 int res; 002998 assert( sqlite3_mutex_held(p->db->mutex) ); 002999 sqlite3BtreeEnter(p); 003000 res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage); 003001 sqlite3BtreeLeave(p); 003002 return res; 003003 } 003004 003005 #if SQLITE_MAX_MMAP_SIZE>0 003006 /* 003007 ** Change the limit on the amount of the database file that may be 003008 ** memory mapped. 003009 */ 003010 int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){ 003011 BtShared *pBt = p->pBt; 003012 assert( sqlite3_mutex_held(p->db->mutex) ); 003013 sqlite3BtreeEnter(p); 003014 sqlite3PagerSetMmapLimit(pBt->pPager, szMmap); 003015 sqlite3BtreeLeave(p); 003016 return SQLITE_OK; 003017 } 003018 #endif /* SQLITE_MAX_MMAP_SIZE>0 */ 003019 003020 /* 003021 ** Change the way data is synced to disk in order to increase or decrease 003022 ** how well the database resists damage due to OS crashes and power 003023 ** failures. Level 1 is the same as asynchronous (no syncs() occur and 003024 ** there is a high probability of damage) Level 2 is the default. There 003025 ** is a very low but non-zero probability of damage. Level 3 reduces the 003026 ** probability of damage to near zero but with a write performance reduction. 003027 */ 003028 #ifndef SQLITE_OMIT_PAGER_PRAGMAS 003029 int sqlite3BtreeSetPagerFlags( 003030 Btree *p, /* The btree to set the safety level on */ 003031 unsigned pgFlags /* Various PAGER_* flags */ 003032 ){ 003033 BtShared *pBt = p->pBt; 003034 assert( sqlite3_mutex_held(p->db->mutex) ); 003035 sqlite3BtreeEnter(p); 003036 sqlite3PagerSetFlags(pBt->pPager, pgFlags); 003037 sqlite3BtreeLeave(p); 003038 return SQLITE_OK; 003039 } 003040 #endif 003041 003042 /* 003043 ** Change the default pages size and the number of reserved bytes per page. 003044 ** Or, if the page size has already been fixed, return SQLITE_READONLY 003045 ** without changing anything. 
003046 ** 003047 ** The page size must be a power of 2 between 512 and 65536. If the page 003048 ** size supplied does not meet this constraint then the page size is not 003049 ** changed. 003050 ** 003051 ** Page sizes are constrained to be a power of two so that the region 003052 ** of the database file used for locking (beginning at PENDING_BYTE, 003053 ** the first byte past the 1GB boundary, 0x40000000) needs to occur 003054 ** at the beginning of a page. 003055 ** 003056 ** If parameter nReserve is less than zero, then the number of reserved 003057 ** bytes per page is left unchanged. 003058 ** 003059 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size 003060 ** and autovacuum mode can no longer be changed. 003061 */ 003062 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){ 003063 int rc = SQLITE_OK; 003064 int x; 003065 BtShared *pBt = p->pBt; 003066 assert( nReserve>=0 && nReserve<=255 ); 003067 sqlite3BtreeEnter(p); 003068 pBt->nReserveWanted = nReserve; 003069 x = pBt->pageSize - pBt->usableSize; 003070 if( nReserve<x ) nReserve = x; 003071 if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){ 003072 sqlite3BtreeLeave(p); 003073 return SQLITE_READONLY; 003074 } 003075 assert( nReserve>=0 && nReserve<=255 ); 003076 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && 003077 ((pageSize-1)&pageSize)==0 ){ 003078 assert( (pageSize & 7)==0 ); 003079 assert( !pBt->pCursor ); 003080 if( nReserve>32 && pageSize==512 ) pageSize = 1024; 003081 pBt->pageSize = (u32)pageSize; 003082 freeTempSpace(pBt); 003083 } 003084 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 003085 pBt->usableSize = pBt->pageSize - (u16)nReserve; 003086 if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003087 sqlite3BtreeLeave(p); 003088 return rc; 003089 } 003090 003091 /* 003092 ** Return the currently defined page size 003093 */ 003094 int sqlite3BtreeGetPageSize(Btree *p){ 003095 return p->pBt->pageSize; 003096 } 003097 003098 
/* 003099 ** This function is similar to sqlite3BtreeGetReserve(), except that it 003100 ** may only be called if it is guaranteed that the b-tree mutex is already 003101 ** held. 003102 ** 003103 ** This is useful in one special case in the backup API code where it is 003104 ** known that the shared b-tree mutex is held, but the mutex on the 003105 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter() 003106 ** were to be called, it might collide with some other operation on the 003107 ** database handle that owns *p, causing undefined behavior. 003108 */ 003109 int sqlite3BtreeGetReserveNoMutex(Btree *p){ 003110 int n; 003111 assert( sqlite3_mutex_held(p->pBt->mutex) ); 003112 n = p->pBt->pageSize - p->pBt->usableSize; 003113 return n; 003114 } 003115 003116 /* 003117 ** Return the number of bytes of space at the end of every page that 003118 ** are intentionally left unused. This is the "reserved" space that is 003119 ** sometimes used by extensions. 003120 ** 003121 ** The value returned is the larger of the current reserve size and 003122 ** the latest reserve size requested by SQLITE_FILECTRL_RESERVE_BYTES. 003123 ** The amount of reserve can only grow - never shrink. 003124 */ 003125 int sqlite3BtreeGetRequestedReserve(Btree *p){ 003126 int n1, n2; 003127 sqlite3BtreeEnter(p); 003128 n1 = (int)p->pBt->nReserveWanted; 003129 n2 = sqlite3BtreeGetReserveNoMutex(p); 003130 sqlite3BtreeLeave(p); 003131 return n1>n2 ? n1 : n2; 003132 } 003133 003134 003135 /* 003136 ** Set the maximum page count for a database if mxPage is positive. 003137 ** No changes are made if mxPage is 0 or negative. 003138 ** Regardless of the value of mxPage, return the maximum page count. 
003139 */ 003140 Pgno sqlite3BtreeMaxPageCount(Btree *p, Pgno mxPage){ 003141 Pgno n; 003142 sqlite3BtreeEnter(p); 003143 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); 003144 sqlite3BtreeLeave(p); 003145 return n; 003146 } 003147 003148 /* 003149 ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags: 003150 ** 003151 ** newFlag==0 Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared 003152 ** newFlag==1 BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared 003153 ** newFlag==2 BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set 003154 ** newFlag==(-1) No changes 003155 ** 003156 ** This routine acts as a query if newFlag is less than zero 003157 ** 003158 ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but 003159 ** freelist leaf pages are not written back to the database. Thus in-page 003160 ** deleted content is cleared, but freelist deleted content is not. 003161 ** 003162 ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition 003163 ** that freelist leaf pages are written back into the database, increasing 003164 ** the amount of disk I/O. 003165 */ 003166 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){ 003167 int b; 003168 if( p==0 ) return 0; 003169 sqlite3BtreeEnter(p); 003170 assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 ); 003171 assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) ); 003172 if( newFlag>=0 ){ 003173 p->pBt->btsFlags &= ~BTS_FAST_SECURE; 003174 p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag; 003175 } 003176 b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE; 003177 sqlite3BtreeLeave(p); 003178 return b; 003179 } 003180 003181 /* 003182 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum' 003183 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it 003184 ** is disabled. The default value for the auto-vacuum property is 003185 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. 
003186 */ 003187 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ 003188 #ifdef SQLITE_OMIT_AUTOVACUUM 003189 return SQLITE_READONLY; 003190 #else 003191 BtShared *pBt = p->pBt; 003192 int rc = SQLITE_OK; 003193 u8 av = (u8)autoVacuum; 003194 003195 sqlite3BtreeEnter(p); 003196 if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){ 003197 rc = SQLITE_READONLY; 003198 }else{ 003199 pBt->autoVacuum = av ?1:0; 003200 pBt->incrVacuum = av==2 ?1:0; 003201 } 003202 sqlite3BtreeLeave(p); 003203 return rc; 003204 #endif 003205 } 003206 003207 /* 003208 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 003209 ** enabled 1 is returned. Otherwise 0. 003210 */ 003211 int sqlite3BtreeGetAutoVacuum(Btree *p){ 003212 #ifdef SQLITE_OMIT_AUTOVACUUM 003213 return BTREE_AUTOVACUUM_NONE; 003214 #else 003215 int rc; 003216 sqlite3BtreeEnter(p); 003217 rc = ( 003218 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE: 003219 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL: 003220 BTREE_AUTOVACUUM_INCR 003221 ); 003222 sqlite3BtreeLeave(p); 003223 return rc; 003224 #endif 003225 } 003226 003227 /* 003228 ** If the user has not set the safety-level for this database connection 003229 ** using "PRAGMA synchronous", and if the safety-level is not already 003230 ** set to the value passed to this function as the second parameter, 003231 ** set it so. 
003232 */ 003233 #if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \ 003234 && !defined(SQLITE_OMIT_WAL) 003235 static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){ 003236 sqlite3 *db; 003237 Db *pDb; 003238 if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){ 003239 while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; } 003240 if( pDb->bSyncSet==0 003241 && pDb->safety_level!=safety_level 003242 && pDb!=&db->aDb[1] 003243 ){ 003244 pDb->safety_level = safety_level; 003245 sqlite3PagerSetFlags(pBt->pPager, 003246 pDb->safety_level | (db->flags & PAGER_FLAGS_MASK)); 003247 } 003248 } 003249 } 003250 #else 003251 # define setDefaultSyncFlag(pBt,safety_level) 003252 #endif 003253 003254 /* Forward declaration */ 003255 static int newDatabase(BtShared*); 003256 003257 003258 /* 003259 ** Get a reference to pPage1 of the database file. This will 003260 ** also acquire a readlock on that file. 003261 ** 003262 ** SQLITE_OK is returned on success. If the file is not a 003263 ** well-formed database file, then SQLITE_CORRUPT is returned. 003264 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM 003265 ** is returned if we run out of memory. 003266 */ 003267 static int lockBtree(BtShared *pBt){ 003268 int rc; /* Result code from subfunctions */ 003269 MemPage *pPage1; /* Page 1 of the database file */ 003270 u32 nPage; /* Number of pages in the database */ 003271 u32 nPageFile = 0; /* Number of pages in the database file */ 003272 003273 assert( sqlite3_mutex_held(pBt->mutex) ); 003274 assert( pBt->pPage1==0 ); 003275 rc = sqlite3PagerSharedLock(pBt->pPager); 003276 if( rc!=SQLITE_OK ) return rc; 003277 rc = btreeGetPage(pBt, 1, &pPage1, 0); 003278 if( rc!=SQLITE_OK ) return rc; 003279 003280 /* Do some checking to help insure the file we opened really is 003281 ** a valid database file. 
003282 */ 003283 nPage = get4byte(28+(u8*)pPage1->aData); 003284 sqlite3PagerPagecount(pBt->pPager, (int*)&nPageFile); 003285 if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){ 003286 nPage = nPageFile; 003287 } 003288 if( (pBt->db->flags & SQLITE_ResetDatabase)!=0 ){ 003289 nPage = 0; 003290 } 003291 if( nPage>0 ){ 003292 u32 pageSize; 003293 u32 usableSize; 003294 u8 *page1 = pPage1->aData; 003295 rc = SQLITE_NOTADB; 003296 /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins 003297 ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d 003298 ** 61 74 20 33 00. */ 003299 if( memcmp(page1, zMagicHeader, 16)!=0 ){ 003300 goto page1_init_failed; 003301 } 003302 003303 #ifdef SQLITE_OMIT_WAL 003304 if( page1[18]>1 ){ 003305 pBt->btsFlags |= BTS_READ_ONLY; 003306 } 003307 if( page1[19]>1 ){ 003308 goto page1_init_failed; 003309 } 003310 #else 003311 if( page1[18]>2 ){ 003312 pBt->btsFlags |= BTS_READ_ONLY; 003313 } 003314 if( page1[19]>2 ){ 003315 goto page1_init_failed; 003316 } 003317 003318 /* If the read version is set to 2, this database should be accessed 003319 ** in WAL mode. If the log is not already open, open it now. Then 003320 ** return SQLITE_OK and return without populating BtShared.pPage1. 003321 ** The caller detects this and calls this function again. This is 003322 ** required as the version of page 1 currently in the page1 buffer 003323 ** may not be the latest version - there may be a newer one in the log 003324 ** file. 
003325 */ 003326 if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){ 003327 int isOpen = 0; 003328 rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen); 003329 if( rc!=SQLITE_OK ){ 003330 goto page1_init_failed; 003331 }else{ 003332 setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1); 003333 if( isOpen==0 ){ 003334 releasePageOne(pPage1); 003335 return SQLITE_OK; 003336 } 003337 } 003338 rc = SQLITE_NOTADB; 003339 }else{ 003340 setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1); 003341 } 003342 #endif 003343 003344 /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload 003345 ** fractions and the leaf payload fraction values must be 64, 32, and 32. 003346 ** 003347 ** The original design allowed these amounts to vary, but as of 003348 ** version 3.6.0, we require them to be fixed. 003349 */ 003350 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){ 003351 goto page1_init_failed; 003352 } 003353 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is 003354 ** determined by the 2-byte integer located at an offset of 16 bytes from 003355 ** the beginning of the database file. */ 003356 pageSize = (page1[16]<<8) | (page1[17]<<16); 003357 /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two 003358 ** between 512 and 65536 inclusive. */ 003359 if( ((pageSize-1)&pageSize)!=0 003360 || pageSize>SQLITE_MAX_PAGE_SIZE 003361 || pageSize<=256 003362 ){ 003363 goto page1_init_failed; 003364 } 003365 assert( (pageSize & 7)==0 ); 003366 /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte 003367 ** integer at offset 20 is the number of bytes of space at the end of 003368 ** each page to reserve for extensions. 003369 ** 003370 ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is 003371 ** determined by the one-byte unsigned integer found at an offset of 20 003372 ** into the database file header. 
*/ 003373 usableSize = pageSize - page1[20]; 003374 if( (u32)pageSize!=pBt->pageSize ){ 003375 /* After reading the first page of the database assuming a page size 003376 ** of BtShared.pageSize, we have discovered that the page-size is 003377 ** actually pageSize. Unlock the database, leave pBt->pPage1 at 003378 ** zero and return SQLITE_OK. The caller will call this function 003379 ** again with the correct page-size. 003380 */ 003381 releasePageOne(pPage1); 003382 pBt->usableSize = usableSize; 003383 pBt->pageSize = pageSize; 003384 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003385 freeTempSpace(pBt); 003386 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, 003387 pageSize-usableSize); 003388 return rc; 003389 } 003390 if( nPage>nPageFile ){ 003391 if( sqlite3WritableSchema(pBt->db)==0 ){ 003392 rc = SQLITE_CORRUPT_BKPT; 003393 goto page1_init_failed; 003394 }else{ 003395 nPage = nPageFile; 003396 } 003397 } 003398 /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to 003399 ** be less than 480. In other words, if the page size is 512, then the 003400 ** reserved space size cannot exceed 32. */ 003401 if( usableSize<480 ){ 003402 goto page1_init_failed; 003403 } 003404 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003405 pBt->pageSize = pageSize; 003406 pBt->usableSize = usableSize; 003407 #ifndef SQLITE_OMIT_AUTOVACUUM 003408 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0); 003409 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0); 003410 #endif 003411 } 003412 003413 /* maxLocal is the maximum amount of payload to store locally for 003414 ** a cell. Make sure it is small enough so that at least minFanout 003415 ** cells can will fit on one page. We assume a 10-byte page header. 
003416 ** Besides the payload, the cell must store: 003417 ** 2-byte pointer to the cell 003418 ** 4-byte child pointer 003419 ** 9-byte nKey value 003420 ** 4-byte nData value 003421 ** 4-byte overflow page pointer 003422 ** So a cell consists of a 2-byte pointer, a header which is as much as 003423 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow 003424 ** page pointer. 003425 */ 003426 pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23); 003427 pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23); 003428 pBt->maxLeaf = (u16)(pBt->usableSize - 35); 003429 pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23); 003430 if( pBt->maxLocal>127 ){ 003431 pBt->max1bytePayload = 127; 003432 }else{ 003433 pBt->max1bytePayload = (u8)pBt->maxLocal; 003434 } 003435 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) ); 003436 pBt->pPage1 = pPage1; 003437 pBt->nPage = nPage; 003438 return SQLITE_OK; 003439 003440 page1_init_failed: 003441 releasePageOne(pPage1); 003442 pBt->pPage1 = 0; 003443 return rc; 003444 } 003445 003446 #ifndef NDEBUG 003447 /* 003448 ** Return the number of cursors open on pBt. This is for use 003449 ** in assert() expressions, so it is only compiled if NDEBUG is not 003450 ** defined. 003451 ** 003452 ** Only write cursors are counted if wrOnly is true. If wrOnly is 003453 ** false then all cursors are counted. 003454 ** 003455 ** For the purposes of this routine, a cursor is any cursor that 003456 ** is capable of reading or writing to the database. Cursors that 003457 ** have been tripped into the CURSOR_FAULT state are not counted. 
003458 */ 003459 static int countValidCursors(BtShared *pBt, int wrOnly){ 003460 BtCursor *pCur; 003461 int r = 0; 003462 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 003463 if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0) 003464 && pCur->eState!=CURSOR_FAULT ) r++; 003465 } 003466 return r; 003467 } 003468 #endif 003469 003470 /* 003471 ** If there are no outstanding cursors and we are not in the middle 003472 ** of a transaction but there is a read lock on the database, then 003473 ** this routine unrefs the first page of the database file which 003474 ** has the effect of releasing the read lock. 003475 ** 003476 ** If there is a transaction in progress, this routine is a no-op. 003477 */ 003478 static void unlockBtreeIfUnused(BtShared *pBt){ 003479 assert( sqlite3_mutex_held(pBt->mutex) ); 003480 assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE ); 003481 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){ 003482 MemPage *pPage1 = pBt->pPage1; 003483 assert( pPage1->aData ); 003484 assert( sqlite3PagerRefcount(pBt->pPager)==1 ); 003485 pBt->pPage1 = 0; 003486 releasePageOne(pPage1); 003487 } 003488 } 003489 003490 /* 003491 ** If pBt points to an empty file then convert that empty file 003492 ** into a new empty database by initializing the first page of 003493 ** the database. 
003494 */ 003495 static int newDatabase(BtShared *pBt){ 003496 MemPage *pP1; 003497 unsigned char *data; 003498 int rc; 003499 003500 assert( sqlite3_mutex_held(pBt->mutex) ); 003501 if( pBt->nPage>0 ){ 003502 return SQLITE_OK; 003503 } 003504 pP1 = pBt->pPage1; 003505 assert( pP1!=0 ); 003506 data = pP1->aData; 003507 rc = sqlite3PagerWrite(pP1->pDbPage); 003508 if( rc ) return rc; 003509 memcpy(data, zMagicHeader, sizeof(zMagicHeader)); 003510 assert( sizeof(zMagicHeader)==16 ); 003511 data[16] = (u8)((pBt->pageSize>>8)&0xff); 003512 data[17] = (u8)((pBt->pageSize>>16)&0xff); 003513 data[18] = 1; 003514 data[19] = 1; 003515 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize); 003516 data[20] = (u8)(pBt->pageSize - pBt->usableSize); 003517 data[21] = 64; 003518 data[22] = 32; 003519 data[23] = 32; 003520 memset(&data[24], 0, 100-24); 003521 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA ); 003522 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003523 #ifndef SQLITE_OMIT_AUTOVACUUM 003524 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 ); 003525 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 ); 003526 put4byte(&data[36 + 4*4], pBt->autoVacuum); 003527 put4byte(&data[36 + 7*4], pBt->incrVacuum); 003528 #endif 003529 pBt->nPage = 1; 003530 data[31] = 1; 003531 return SQLITE_OK; 003532 } 003533 003534 /* 003535 ** Initialize the first page of the database file (creating a database 003536 ** consisting of a single page and no schema objects). Return SQLITE_OK 003537 ** if successful, or an SQLite error code otherwise. 003538 */ 003539 int sqlite3BtreeNewDb(Btree *p){ 003540 int rc; 003541 sqlite3BtreeEnter(p); 003542 p->pBt->nPage = 0; 003543 rc = newDatabase(p->pBt); 003544 sqlite3BtreeLeave(p); 003545 return rc; 003546 } 003547 003548 /* 003549 ** Attempt to start a new transaction. A write-transaction 003550 ** is started if the second argument is nonzero, otherwise a read- 003551 ** transaction. 
** If the second argument is 2 or more then an exclusive
** transaction is started, meaning that no other process is allowed
** to access the database. A preexisting transaction may not be
** upgraded to exclusive by calling this routine a second time - the
** exclusivity flag only works for a new transaction.
**
** A write-transaction must be started before attempting any
** changes to the database. None of the following routines
** will work unless a transaction is started first:
**
**      sqlite3BtreeCreateTable()
**      sqlite3BtreeCreateIndex()
**      sqlite3BtreeClearTable()
**      sqlite3BtreeDropTable()
**      sqlite3BtreeInsert()
**      sqlite3BtreeDelete()
**      sqlite3BtreeUpdateMeta()
**
** If an initial attempt to acquire the lock fails because of lock contention
** and the database was previously unlocked, then invoke the busy handler
** if there is one. But if there was previously a read-lock, do not
** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
** returned when there is already a read-lock in order to avoid a deadlock.
**
** Suppose there are two processes A and B. A has a read lock and B has
** a reserved lock. B tries to promote to exclusive but is blocked because
** of A's read lock. A tries to promote to reserved but is blocked by B.
** One or the other of the two processes must give way or there can be
** no progress. By returning SQLITE_BUSY and not invoking the busy callback
** when A already has a read lock, we encourage A to give up and let B
** proceed.
003582 */ 003583 static SQLITE_NOINLINE int btreeBeginTrans( 003584 Btree *p, /* The btree in which to start the transaction */ 003585 int wrflag, /* True to start a write transaction */ 003586 int *pSchemaVersion /* Put schema version number here, if not NULL */ 003587 ){ 003588 BtShared *pBt = p->pBt; 003589 Pager *pPager = pBt->pPager; 003590 int rc = SQLITE_OK; 003591 003592 sqlite3BtreeEnter(p); 003593 btreeIntegrity(p); 003594 003595 /* If the btree is already in a write-transaction, or it 003596 ** is already in a read-transaction and a read-transaction 003597 ** is requested, this is a no-op. 003598 */ 003599 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){ 003600 goto trans_begun; 003601 } 003602 assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 ); 003603 003604 if( (p->db->flags & SQLITE_ResetDatabase) 003605 && sqlite3PagerIsreadonly(pPager)==0 003606 ){ 003607 pBt->btsFlags &= ~BTS_READ_ONLY; 003608 } 003609 003610 /* Write transactions are not possible on a read-only database */ 003611 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){ 003612 rc = SQLITE_READONLY; 003613 goto trans_begun; 003614 } 003615 003616 #ifndef SQLITE_OMIT_SHARED_CACHE 003617 { 003618 sqlite3 *pBlock = 0; 003619 /* If another database handle has already opened a write transaction 003620 ** on this shared-btree structure and a second write transaction is 003621 ** requested, return SQLITE_LOCKED. 
003622 */ 003623 if( (wrflag && pBt->inTransaction==TRANS_WRITE) 003624 || (pBt->btsFlags & BTS_PENDING)!=0 003625 ){ 003626 pBlock = pBt->pWriter->db; 003627 }else if( wrflag>1 ){ 003628 BtLock *pIter; 003629 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 003630 if( pIter->pBtree!=p ){ 003631 pBlock = pIter->pBtree->db; 003632 break; 003633 } 003634 } 003635 } 003636 if( pBlock ){ 003637 sqlite3ConnectionBlocked(p->db, pBlock); 003638 rc = SQLITE_LOCKED_SHAREDCACHE; 003639 goto trans_begun; 003640 } 003641 } 003642 #endif 003643 003644 /* Any read-only or read-write transaction implies a read-lock on 003645 ** page 1. So if some other shared-cache client already has a write-lock 003646 ** on page 1, the transaction cannot be opened. */ 003647 rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK); 003648 if( SQLITE_OK!=rc ) goto trans_begun; 003649 003650 pBt->btsFlags &= ~BTS_INITIALLY_EMPTY; 003651 if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY; 003652 do { 003653 sqlite3PagerWalDb(pPager, p->db); 003654 003655 #ifdef SQLITE_ENABLE_SETLK_TIMEOUT 003656 /* If transitioning from no transaction directly to a write transaction, 003657 ** block for the WRITER lock first if possible. */ 003658 if( pBt->pPage1==0 && wrflag ){ 003659 assert( pBt->inTransaction==TRANS_NONE ); 003660 rc = sqlite3PagerWalWriteLock(pPager, 1); 003661 if( rc!=SQLITE_BUSY && rc!=SQLITE_OK ) break; 003662 } 003663 #endif 003664 003665 /* Call lockBtree() until either pBt->pPage1 is populated or 003666 ** lockBtree() returns something other than SQLITE_OK. lockBtree() 003667 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after 003668 ** reading page 1 it discovers that the page-size of the database 003669 ** file is not pBt->pageSize. In this case lockBtree() will update 003670 ** pBt->pageSize to the page-size of the file on disk. 
003671 */ 003672 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) ); 003673 003674 if( rc==SQLITE_OK && wrflag ){ 003675 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){ 003676 rc = SQLITE_READONLY; 003677 }else{ 003678 rc = sqlite3PagerBegin(pPager, wrflag>1, sqlite3TempInMemory(p->db)); 003679 if( rc==SQLITE_OK ){ 003680 rc = newDatabase(pBt); 003681 }else if( rc==SQLITE_BUSY_SNAPSHOT && pBt->inTransaction==TRANS_NONE ){ 003682 /* if there was no transaction opened when this function was 003683 ** called and SQLITE_BUSY_SNAPSHOT is returned, change the error 003684 ** code to SQLITE_BUSY. */ 003685 rc = SQLITE_BUSY; 003686 } 003687 } 003688 } 003689 003690 if( rc!=SQLITE_OK ){ 003691 (void)sqlite3PagerWalWriteLock(pPager, 0); 003692 unlockBtreeIfUnused(pBt); 003693 } 003694 }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && 003695 btreeInvokeBusyHandler(pBt) ); 003696 sqlite3PagerWalDb(pPager, 0); 003697 #ifdef SQLITE_ENABLE_SETLK_TIMEOUT 003698 if( rc==SQLITE_BUSY_TIMEOUT ) rc = SQLITE_BUSY; 003699 #endif 003700 003701 if( rc==SQLITE_OK ){ 003702 if( p->inTrans==TRANS_NONE ){ 003703 pBt->nTransaction++; 003704 #ifndef SQLITE_OMIT_SHARED_CACHE 003705 if( p->sharable ){ 003706 assert( p->lock.pBtree==p && p->lock.iTable==1 ); 003707 p->lock.eLock = READ_LOCK; 003708 p->lock.pNext = pBt->pLock; 003709 pBt->pLock = &p->lock; 003710 } 003711 #endif 003712 } 003713 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); 003714 if( p->inTrans>pBt->inTransaction ){ 003715 pBt->inTransaction = p->inTrans; 003716 } 003717 if( wrflag ){ 003718 MemPage *pPage1 = pBt->pPage1; 003719 #ifndef SQLITE_OMIT_SHARED_CACHE 003720 assert( !pBt->pWriter ); 003721 pBt->pWriter = p; 003722 pBt->btsFlags &= ~BTS_EXCLUSIVE; 003723 if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE; 003724 #endif 003725 003726 /* If the db-size header field is incorrect (as it may be if an old 003727 ** client has been writing the database file), update it now. 
Doing 003728 ** this sooner rather than later means the database size can safely 003729 ** re-read the database size from page 1 if a savepoint or transaction 003730 ** rollback occurs within the transaction. 003731 */ 003732 if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){ 003733 rc = sqlite3PagerWrite(pPage1->pDbPage); 003734 if( rc==SQLITE_OK ){ 003735 put4byte(&pPage1->aData[28], pBt->nPage); 003736 } 003737 } 003738 } 003739 } 003740 003741 trans_begun: 003742 if( rc==SQLITE_OK ){ 003743 if( pSchemaVersion ){ 003744 *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]); 003745 } 003746 if( wrflag ){ 003747 /* This call makes sure that the pager has the correct number of 003748 ** open savepoints. If the second parameter is greater than 0 and 003749 ** the sub-journal is not already open, then it will be opened here. 003750 */ 003751 rc = sqlite3PagerOpenSavepoint(pPager, p->db->nSavepoint); 003752 } 003753 } 003754 003755 btreeIntegrity(p); 003756 sqlite3BtreeLeave(p); 003757 return rc; 003758 } 003759 int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){ 003760 BtShared *pBt; 003761 if( p->sharable 003762 || p->inTrans==TRANS_NONE 003763 || (p->inTrans==TRANS_READ && wrflag!=0) 003764 ){ 003765 return btreeBeginTrans(p,wrflag,pSchemaVersion); 003766 } 003767 pBt = p->pBt; 003768 if( pSchemaVersion ){ 003769 *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]); 003770 } 003771 if( wrflag ){ 003772 /* This call makes sure that the pager has the correct number of 003773 ** open savepoints. If the second parameter is greater than 0 and 003774 ** the sub-journal is not already open, then it will be opened here. 003775 */ 003776 return sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint); 003777 }else{ 003778 return SQLITE_OK; 003779 } 003780 } 003781 003782 #ifndef SQLITE_OMIT_AUTOVACUUM 003783 003784 /* 003785 ** Set the pointer-map entries for all children of page pPage. 
Also, if 003786 ** pPage contains cells that point to overflow pages, set the pointer 003787 ** map entries for the overflow pages as well. 003788 */ 003789 static int setChildPtrmaps(MemPage *pPage){ 003790 int i; /* Counter variable */ 003791 int nCell; /* Number of cells in page pPage */ 003792 int rc; /* Return code */ 003793 BtShared *pBt = pPage->pBt; 003794 Pgno pgno = pPage->pgno; 003795 003796 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 003797 rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage); 003798 if( rc!=SQLITE_OK ) return rc; 003799 nCell = pPage->nCell; 003800 003801 for(i=0; i<nCell; i++){ 003802 u8 *pCell = findCell(pPage, i); 003803 003804 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc); 003805 003806 if( !pPage->leaf ){ 003807 Pgno childPgno = get4byte(pCell); 003808 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 003809 } 003810 } 003811 003812 if( !pPage->leaf ){ 003813 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 003814 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 003815 } 003816 003817 return rc; 003818 } 003819 003820 /* 003821 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so 003822 ** that it points to iTo. Parameter eType describes the type of pointer to 003823 ** be modified, as follows: 003824 ** 003825 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child 003826 ** page of pPage. 003827 ** 003828 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow 003829 ** page pointed to by one of the cells on pPage. 003830 ** 003831 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next 003832 ** overflow page in the list. 
003833 */ 003834 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ 003835 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 003836 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 003837 if( eType==PTRMAP_OVERFLOW2 ){ 003838 /* The pointer is always the first 4 bytes of the page in this case. */ 003839 if( get4byte(pPage->aData)!=iFrom ){ 003840 return SQLITE_CORRUPT_PAGE(pPage); 003841 } 003842 put4byte(pPage->aData, iTo); 003843 }else{ 003844 int i; 003845 int nCell; 003846 int rc; 003847 003848 rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage); 003849 if( rc ) return rc; 003850 nCell = pPage->nCell; 003851 003852 for(i=0; i<nCell; i++){ 003853 u8 *pCell = findCell(pPage, i); 003854 if( eType==PTRMAP_OVERFLOW1 ){ 003855 CellInfo info; 003856 pPage->xParseCell(pPage, pCell, &info); 003857 if( info.nLocal<info.nPayload ){ 003858 if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){ 003859 return SQLITE_CORRUPT_PAGE(pPage); 003860 } 003861 if( iFrom==get4byte(pCell+info.nSize-4) ){ 003862 put4byte(pCell+info.nSize-4, iTo); 003863 break; 003864 } 003865 } 003866 }else{ 003867 if( pCell+4 > pPage->aData+pPage->pBt->usableSize ){ 003868 return SQLITE_CORRUPT_PAGE(pPage); 003869 } 003870 if( get4byte(pCell)==iFrom ){ 003871 put4byte(pCell, iTo); 003872 break; 003873 } 003874 } 003875 } 003876 003877 if( i==nCell ){ 003878 if( eType!=PTRMAP_BTREE || 003879 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){ 003880 return SQLITE_CORRUPT_PAGE(pPage); 003881 } 003882 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo); 003883 } 003884 } 003885 return SQLITE_OK; 003886 } 003887 003888 003889 /* 003890 ** Move the open database page pDbPage to location iFreePage in the 003891 ** database. The pDbPage reference remains valid. 003892 ** 003893 ** The isCommit flag indicates that there is no need to remember that 003894 ** the journal needs to be sync()ed before database page pDbPage->pgno 003895 ** can be written to. 
The caller has already promised not to write to that 003896 ** page. 003897 */ 003898 static int relocatePage( 003899 BtShared *pBt, /* Btree */ 003900 MemPage *pDbPage, /* Open page to move */ 003901 u8 eType, /* Pointer map 'type' entry for pDbPage */ 003902 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */ 003903 Pgno iFreePage, /* The location to move pDbPage to */ 003904 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */ 003905 ){ 003906 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */ 003907 Pgno iDbPage = pDbPage->pgno; 003908 Pager *pPager = pBt->pPager; 003909 int rc; 003910 003911 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 003912 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); 003913 assert( sqlite3_mutex_held(pBt->mutex) ); 003914 assert( pDbPage->pBt==pBt ); 003915 if( iDbPage<3 ) return SQLITE_CORRUPT_BKPT; 003916 003917 /* Move page iDbPage from its current location to page number iFreePage */ 003918 TRACE(("AUTOVACUUM: Moving %u to free page %u (ptr page %u type %u)\n", 003919 iDbPage, iFreePage, iPtrPage, eType)); 003920 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit); 003921 if( rc!=SQLITE_OK ){ 003922 return rc; 003923 } 003924 pDbPage->pgno = iFreePage; 003925 003926 /* If pDbPage was a btree-page, then it may have child pages and/or cells 003927 ** that point to overflow pages. The pointer map entries for all these 003928 ** pages need to be changed. 003929 ** 003930 ** If pDbPage is an overflow page, then the first 4 bytes may store a 003931 ** pointer to a subsequent overflow page. If this is the case, then 003932 ** the pointer map needs to be updated for the subsequent overflow page. 
003933 */ 003934 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){ 003935 rc = setChildPtrmaps(pDbPage); 003936 if( rc!=SQLITE_OK ){ 003937 return rc; 003938 } 003939 }else{ 003940 Pgno nextOvfl = get4byte(pDbPage->aData); 003941 if( nextOvfl!=0 ){ 003942 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc); 003943 if( rc!=SQLITE_OK ){ 003944 return rc; 003945 } 003946 } 003947 } 003948 003949 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so 003950 ** that it points at iFreePage. Also fix the pointer map entry for 003951 ** iPtrPage. 003952 */ 003953 if( eType!=PTRMAP_ROOTPAGE ){ 003954 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0); 003955 if( rc!=SQLITE_OK ){ 003956 return rc; 003957 } 003958 rc = sqlite3PagerWrite(pPtrPage->pDbPage); 003959 if( rc!=SQLITE_OK ){ 003960 releasePage(pPtrPage); 003961 return rc; 003962 } 003963 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType); 003964 releasePage(pPtrPage); 003965 if( rc==SQLITE_OK ){ 003966 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc); 003967 } 003968 } 003969 return rc; 003970 } 003971 003972 /* Forward declaration required by incrVacuumStep(). */ 003973 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); 003974 003975 /* 003976 ** Perform a single step of an incremental-vacuum. If successful, return 003977 ** SQLITE_OK. If there is no work to do (and therefore no point in 003978 ** calling this function again), return SQLITE_DONE. Or, if an error 003979 ** occurs, return some other error code. 003980 ** 003981 ** More specifically, this function attempts to re-organize the database so 003982 ** that the last page of the file currently in use is no longer in use. 003983 ** 003984 ** Parameter nFin is the number of pages that this database would contain 003985 ** were this function called until it returns SQLITE_DONE. 
003986 ** 003987 ** If the bCommit parameter is non-zero, this function assumes that the 003988 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE 003989 ** or an error. bCommit is passed true for an auto-vacuum-on-commit 003990 ** operation, or false for an incremental vacuum. 003991 */ 003992 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){ 003993 Pgno nFreeList; /* Number of pages still on the free-list */ 003994 int rc; 003995 003996 assert( sqlite3_mutex_held(pBt->mutex) ); 003997 assert( iLastPg>nFin ); 003998 003999 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){ 004000 u8 eType; 004001 Pgno iPtrPage; 004002 004003 nFreeList = get4byte(&pBt->pPage1->aData[36]); 004004 if( nFreeList==0 ){ 004005 return SQLITE_DONE; 004006 } 004007 004008 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage); 004009 if( rc!=SQLITE_OK ){ 004010 return rc; 004011 } 004012 if( eType==PTRMAP_ROOTPAGE ){ 004013 return SQLITE_CORRUPT_BKPT; 004014 } 004015 004016 if( eType==PTRMAP_FREEPAGE ){ 004017 if( bCommit==0 ){ 004018 /* Remove the page from the files free-list. This is not required 004019 ** if bCommit is non-zero. In that case, the free-list will be 004020 ** truncated to zero after this function returns, so it doesn't 004021 ** matter if it still contains some garbage entries. 
004022 */ 004023 Pgno iFreePg; 004024 MemPage *pFreePg; 004025 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT); 004026 if( rc!=SQLITE_OK ){ 004027 return rc; 004028 } 004029 assert( iFreePg==iLastPg ); 004030 releasePage(pFreePg); 004031 } 004032 } else { 004033 Pgno iFreePg; /* Index of free page to move pLastPg to */ 004034 MemPage *pLastPg; 004035 u8 eMode = BTALLOC_ANY; /* Mode parameter for allocateBtreePage() */ 004036 Pgno iNear = 0; /* nearby parameter for allocateBtreePage() */ 004037 004038 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0); 004039 if( rc!=SQLITE_OK ){ 004040 return rc; 004041 } 004042 004043 /* If bCommit is zero, this loop runs exactly once and page pLastPg 004044 ** is swapped with the first free page pulled off the free list. 004045 ** 004046 ** On the other hand, if bCommit is greater than zero, then keep 004047 ** looping until a free-page located within the first nFin pages 004048 ** of the file is found. 004049 */ 004050 if( bCommit==0 ){ 004051 eMode = BTALLOC_LE; 004052 iNear = nFin; 004053 } 004054 do { 004055 MemPage *pFreePg; 004056 Pgno dbSize = btreePagecount(pBt); 004057 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode); 004058 if( rc!=SQLITE_OK ){ 004059 releasePage(pLastPg); 004060 return rc; 004061 } 004062 releasePage(pFreePg); 004063 if( iFreePg>dbSize ){ 004064 releasePage(pLastPg); 004065 return SQLITE_CORRUPT_BKPT; 004066 } 004067 }while( bCommit && iFreePg>nFin ); 004068 assert( iFreePg<iLastPg ); 004069 004070 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit); 004071 releasePage(pLastPg); 004072 if( rc!=SQLITE_OK ){ 004073 return rc; 004074 } 004075 } 004076 } 004077 004078 if( bCommit==0 ){ 004079 do { 004080 iLastPg--; 004081 }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) ); 004082 pBt->bDoTruncate = 1; 004083 pBt->nPage = iLastPg; 004084 } 004085 return SQLITE_OK; 004086 } 004087 004088 /* 004089 ** The database opened by the first argument 
is an auto-vacuum database 004090 ** nOrig pages in size containing nFree free pages. Return the expected 004091 ** size of the database in pages following an auto-vacuum operation. 004092 */ 004093 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){ 004094 int nEntry; /* Number of entries on one ptrmap page */ 004095 Pgno nPtrmap; /* Number of PtrMap pages to be freed */ 004096 Pgno nFin; /* Return value */ 004097 004098 nEntry = pBt->usableSize/5; 004099 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry; 004100 nFin = nOrig - nFree - nPtrmap; 004101 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){ 004102 nFin--; 004103 } 004104 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ 004105 nFin--; 004106 } 004107 004108 return nFin; 004109 } 004110 004111 /* 004112 ** A write-transaction must be opened before calling this function. 004113 ** It performs a single unit of work towards an incremental vacuum. 004114 ** 004115 ** If the incremental vacuum is finished after this function has run, 004116 ** SQLITE_DONE is returned. If it is not finished, but no error occurred, 004117 ** SQLITE_OK is returned. Otherwise an SQLite error code. 
004118 */ 004119 int sqlite3BtreeIncrVacuum(Btree *p){ 004120 int rc; 004121 BtShared *pBt = p->pBt; 004122 004123 sqlite3BtreeEnter(p); 004124 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE ); 004125 if( !pBt->autoVacuum ){ 004126 rc = SQLITE_DONE; 004127 }else{ 004128 Pgno nOrig = btreePagecount(pBt); 004129 Pgno nFree = get4byte(&pBt->pPage1->aData[36]); 004130 Pgno nFin = finalDbSize(pBt, nOrig, nFree); 004131 004132 if( nOrig<nFin || nFree>=nOrig ){ 004133 rc = SQLITE_CORRUPT_BKPT; 004134 }else if( nFree>0 ){ 004135 rc = saveAllCursors(pBt, 0, 0); 004136 if( rc==SQLITE_OK ){ 004137 invalidateAllOverflowCache(pBt); 004138 rc = incrVacuumStep(pBt, nFin, nOrig, 0); 004139 } 004140 if( rc==SQLITE_OK ){ 004141 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 004142 put4byte(&pBt->pPage1->aData[28], pBt->nPage); 004143 } 004144 }else{ 004145 rc = SQLITE_DONE; 004146 } 004147 } 004148 sqlite3BtreeLeave(p); 004149 return rc; 004150 } 004151 004152 /* 004153 ** This routine is called prior to sqlite3PagerCommit when a transaction 004154 ** is committed for an auto-vacuum database. 
004155 */ 004156 static int autoVacuumCommit(Btree *p){ 004157 int rc = SQLITE_OK; 004158 Pager *pPager; 004159 BtShared *pBt; 004160 sqlite3 *db; 004161 VVA_ONLY( int nRef ); 004162 004163 assert( p!=0 ); 004164 pBt = p->pBt; 004165 pPager = pBt->pPager; 004166 VVA_ONLY( nRef = sqlite3PagerRefcount(pPager); ) 004167 004168 assert( sqlite3_mutex_held(pBt->mutex) ); 004169 invalidateAllOverflowCache(pBt); 004170 assert(pBt->autoVacuum); 004171 if( !pBt->incrVacuum ){ 004172 Pgno nFin; /* Number of pages in database after autovacuuming */ 004173 Pgno nFree; /* Number of pages on the freelist initially */ 004174 Pgno nVac; /* Number of pages to vacuum */ 004175 Pgno iFree; /* The next page to be freed */ 004176 Pgno nOrig; /* Database size before freeing */ 004177 004178 nOrig = btreePagecount(pBt); 004179 if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){ 004180 /* It is not possible to create a database for which the final page 004181 ** is either a pointer-map page or the pending-byte page. If one 004182 ** is encountered, this indicates corruption. 
004183 */ 004184 return SQLITE_CORRUPT_BKPT; 004185 } 004186 004187 nFree = get4byte(&pBt->pPage1->aData[36]); 004188 db = p->db; 004189 if( db->xAutovacPages ){ 004190 int iDb; 004191 for(iDb=0; ALWAYS(iDb<db->nDb); iDb++){ 004192 if( db->aDb[iDb].pBt==p ) break; 004193 } 004194 nVac = db->xAutovacPages( 004195 db->pAutovacPagesArg, 004196 db->aDb[iDb].zDbSName, 004197 nOrig, 004198 nFree, 004199 pBt->pageSize 004200 ); 004201 if( nVac>nFree ){ 004202 nVac = nFree; 004203 } 004204 if( nVac==0 ){ 004205 return SQLITE_OK; 004206 } 004207 }else{ 004208 nVac = nFree; 004209 } 004210 nFin = finalDbSize(pBt, nOrig, nVac); 004211 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT; 004212 if( nFin<nOrig ){ 004213 rc = saveAllCursors(pBt, 0, 0); 004214 } 004215 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){ 004216 rc = incrVacuumStep(pBt, nFin, iFree, nVac==nFree); 004217 } 004218 if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){ 004219 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 004220 if( nVac==nFree ){ 004221 put4byte(&pBt->pPage1->aData[32], 0); 004222 put4byte(&pBt->pPage1->aData[36], 0); 004223 } 004224 put4byte(&pBt->pPage1->aData[28], nFin); 004225 pBt->bDoTruncate = 1; 004226 pBt->nPage = nFin; 004227 } 004228 if( rc!=SQLITE_OK ){ 004229 sqlite3PagerRollback(pPager); 004230 } 004231 } 004232 004233 assert( nRef>=sqlite3PagerRefcount(pPager) ); 004234 return rc; 004235 } 004236 004237 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */ 004238 # define setChildPtrmaps(x) SQLITE_OK 004239 #endif 004240 004241 /* 004242 ** This routine does the first phase of a two-phase commit. This routine 004243 ** causes a rollback journal to be created (if it does not already exist) 004244 ** and populated with enough information so that if a power loss occurs 004245 ** the database can be restored to its original state by playing back 004246 ** the journal. Then the contents of the journal are flushed out to 004247 ** the disk. 
After the journal is safely on oxide, the changes to the 004248 ** database are written into the database file and flushed to oxide. 004249 ** At the end of this call, the rollback journal still exists on the 004250 ** disk and we are still holding all locks, so the transaction has not 004251 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the 004252 ** commit process. 004253 ** 004254 ** This call is a no-op if no write-transaction is currently active on pBt. 004255 ** 004256 ** Otherwise, sync the database file for the btree pBt. zSuperJrnl points to 004257 ** the name of a super-journal file that should be written into the 004258 ** individual journal file, or is NULL, indicating no super-journal file 004259 ** (single database transaction). 004260 ** 004261 ** When this is called, the super-journal should already have been 004262 ** created, populated with this journal pointer and synced to disk. 004263 ** 004264 ** Once this is routine has returned, the only thing required to commit 004265 ** the write-transaction for this database file is to delete the journal. 004266 */ 004267 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zSuperJrnl){ 004268 int rc = SQLITE_OK; 004269 if( p->inTrans==TRANS_WRITE ){ 004270 BtShared *pBt = p->pBt; 004271 sqlite3BtreeEnter(p); 004272 #ifndef SQLITE_OMIT_AUTOVACUUM 004273 if( pBt->autoVacuum ){ 004274 rc = autoVacuumCommit(p); 004275 if( rc!=SQLITE_OK ){ 004276 sqlite3BtreeLeave(p); 004277 return rc; 004278 } 004279 } 004280 if( pBt->bDoTruncate ){ 004281 sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage); 004282 } 004283 #endif 004284 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zSuperJrnl, 0); 004285 sqlite3BtreeLeave(p); 004286 } 004287 return rc; 004288 } 004289 004290 /* 004291 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback() 004292 ** at the conclusion of a transaction. 
004293 */ 004294 static void btreeEndTransaction(Btree *p){ 004295 BtShared *pBt = p->pBt; 004296 sqlite3 *db = p->db; 004297 assert( sqlite3BtreeHoldsMutex(p) ); 004298 004299 #ifndef SQLITE_OMIT_AUTOVACUUM 004300 pBt->bDoTruncate = 0; 004301 #endif 004302 if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){ 004303 /* If there are other active statements that belong to this database 004304 ** handle, downgrade to a read-only transaction. The other statements 004305 ** may still be reading from the database. */ 004306 downgradeAllSharedCacheTableLocks(p); 004307 p->inTrans = TRANS_READ; 004308 }else{ 004309 /* If the handle had any kind of transaction open, decrement the 004310 ** transaction count of the shared btree. If the transaction count 004311 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused() 004312 ** call below will unlock the pager. */ 004313 if( p->inTrans!=TRANS_NONE ){ 004314 clearAllSharedCacheTableLocks(p); 004315 pBt->nTransaction--; 004316 if( 0==pBt->nTransaction ){ 004317 pBt->inTransaction = TRANS_NONE; 004318 } 004319 } 004320 004321 /* Set the current transaction state to TRANS_NONE and unlock the 004322 ** pager if this call closed the only read or write transaction. */ 004323 p->inTrans = TRANS_NONE; 004324 unlockBtreeIfUnused(pBt); 004325 } 004326 004327 btreeIntegrity(p); 004328 } 004329 004330 /* 004331 ** Commit the transaction currently in progress. 004332 ** 004333 ** This routine implements the second phase of a 2-phase commit. The 004334 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should 004335 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne() 004336 ** routine did all the work of writing information out to disk and flushing the 004337 ** contents so that they are written onto the disk platter. 
All this 004338 ** routine has to do is delete or truncate or zero the header in the 004339 ** the rollback journal (which causes the transaction to commit) and 004340 ** drop locks. 004341 ** 004342 ** Normally, if an error occurs while the pager layer is attempting to 004343 ** finalize the underlying journal file, this function returns an error and 004344 ** the upper layer will attempt a rollback. However, if the second argument 004345 ** is non-zero then this b-tree transaction is part of a multi-file 004346 ** transaction. In this case, the transaction has already been committed 004347 ** (by deleting a super-journal file) and the caller will ignore this 004348 ** functions return code. So, even if an error occurs in the pager layer, 004349 ** reset the b-tree objects internal state to indicate that the write 004350 ** transaction has been closed. This is quite safe, as the pager will have 004351 ** transitioned to the error state. 004352 ** 004353 ** This will release the write lock on the database file. If there 004354 ** are no active cursors, it also releases the read lock. 004355 */ 004356 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){ 004357 004358 if( p->inTrans==TRANS_NONE ) return SQLITE_OK; 004359 sqlite3BtreeEnter(p); 004360 btreeIntegrity(p); 004361 004362 /* If the handle has a write-transaction open, commit the shared-btrees 004363 ** transaction and set the shared state to TRANS_READ. 
004364 */ 004365 if( p->inTrans==TRANS_WRITE ){ 004366 int rc; 004367 BtShared *pBt = p->pBt; 004368 assert( pBt->inTransaction==TRANS_WRITE ); 004369 assert( pBt->nTransaction>0 ); 004370 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager); 004371 if( rc!=SQLITE_OK && bCleanup==0 ){ 004372 sqlite3BtreeLeave(p); 004373 return rc; 004374 } 004375 p->iBDataVersion--; /* Compensate for pPager->iDataVersion++; */ 004376 pBt->inTransaction = TRANS_READ; 004377 btreeClearHasContent(pBt); 004378 } 004379 004380 btreeEndTransaction(p); 004381 sqlite3BtreeLeave(p); 004382 return SQLITE_OK; 004383 } 004384 004385 /* 004386 ** Do both phases of a commit. 004387 */ 004388 int sqlite3BtreeCommit(Btree *p){ 004389 int rc; 004390 sqlite3BtreeEnter(p); 004391 rc = sqlite3BtreeCommitPhaseOne(p, 0); 004392 if( rc==SQLITE_OK ){ 004393 rc = sqlite3BtreeCommitPhaseTwo(p, 0); 004394 } 004395 sqlite3BtreeLeave(p); 004396 return rc; 004397 } 004398 004399 /* 004400 ** This routine sets the state to CURSOR_FAULT and the error 004401 ** code to errCode for every cursor on any BtShared that pBtree 004402 ** references. Or if the writeOnly flag is set to 1, then only 004403 ** trip write cursors and leave read cursors unchanged. 004404 ** 004405 ** Every cursor is a candidate to be tripped, including cursors 004406 ** that belong to other database connections that happen to be 004407 ** sharing the cache with pBtree. 004408 ** 004409 ** This routine gets called when a rollback occurs. If the writeOnly 004410 ** flag is true, then only write-cursors need be tripped - read-only 004411 ** cursors save their current positions so that they may continue 004412 ** following the rollback. Or, if writeOnly is false, all cursors are 004413 ** tripped. In general, writeOnly is false if the transaction being 004414 ** rolled back modified the database schema. In this case b-tree root 004415 ** pages may be moved or deleted from the database altogether, making 004416 ** it unsafe for read cursors to continue. 
004417 ** 004418 ** If the writeOnly flag is true and an error is encountered while 004419 ** saving the current position of a read-only cursor, all cursors, 004420 ** including all read-cursors are tripped. 004421 ** 004422 ** SQLITE_OK is returned if successful, or if an error occurs while 004423 ** saving a cursor position, an SQLite error code. 004424 */ 004425 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){ 004426 BtCursor *p; 004427 int rc = SQLITE_OK; 004428 004429 assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 ); 004430 if( pBtree ){ 004431 sqlite3BtreeEnter(pBtree); 004432 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 004433 if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){ 004434 if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){ 004435 rc = saveCursorPosition(p); 004436 if( rc!=SQLITE_OK ){ 004437 (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0); 004438 break; 004439 } 004440 } 004441 }else{ 004442 sqlite3BtreeClearCursor(p); 004443 p->eState = CURSOR_FAULT; 004444 p->skipNext = errCode; 004445 } 004446 btreeReleaseAllCursorPages(p); 004447 } 004448 sqlite3BtreeLeave(pBtree); 004449 } 004450 return rc; 004451 } 004452 004453 /* 004454 ** Set the pBt->nPage field correctly, according to the current 004455 ** state of the database. Assume pBt->pPage1 is valid. 004456 */ 004457 static void btreeSetNPage(BtShared *pBt, MemPage *pPage1){ 004458 int nPage = get4byte(&pPage1->aData[28]); 004459 testcase( nPage==0 ); 004460 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage); 004461 testcase( pBt->nPage!=(u32)nPage ); 004462 pBt->nPage = nPage; 004463 } 004464 004465 /* 004466 ** Rollback the transaction in progress. 004467 ** 004468 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped). 004469 ** Only write cursors are tripped if writeOnly is true but all cursors are 004470 ** tripped if writeOnly is false. Any attempt to use 004471 ** a tripped cursor will result in an error. 
004472 ** 004473 ** This will release the write lock on the database file. If there 004474 ** are no active cursors, it also releases the read lock. 004475 */ 004476 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){ 004477 int rc; 004478 BtShared *pBt = p->pBt; 004479 MemPage *pPage1; 004480 004481 assert( writeOnly==1 || writeOnly==0 ); 004482 assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK ); 004483 sqlite3BtreeEnter(p); 004484 if( tripCode==SQLITE_OK ){ 004485 rc = tripCode = saveAllCursors(pBt, 0, 0); 004486 if( rc ) writeOnly = 0; 004487 }else{ 004488 rc = SQLITE_OK; 004489 } 004490 if( tripCode ){ 004491 int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly); 004492 assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) ); 004493 if( rc2!=SQLITE_OK ) rc = rc2; 004494 } 004495 btreeIntegrity(p); 004496 004497 if( p->inTrans==TRANS_WRITE ){ 004498 int rc2; 004499 004500 assert( TRANS_WRITE==pBt->inTransaction ); 004501 rc2 = sqlite3PagerRollback(pBt->pPager); 004502 if( rc2!=SQLITE_OK ){ 004503 rc = rc2; 004504 } 004505 004506 /* The rollback may have destroyed the pPage1->aData value. So 004507 ** call btreeGetPage() on page 1 again to make 004508 ** sure pPage1->aData is set correctly. */ 004509 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ 004510 btreeSetNPage(pBt, pPage1); 004511 releasePageOne(pPage1); 004512 } 004513 assert( countValidCursors(pBt, 1)==0 ); 004514 pBt->inTransaction = TRANS_READ; 004515 btreeClearHasContent(pBt); 004516 } 004517 004518 btreeEndTransaction(p); 004519 sqlite3BtreeLeave(p); 004520 return rc; 004521 } 004522 004523 /* 004524 ** Start a statement subtransaction. The subtransaction can be rolled 004525 ** back independently of the main transaction. You must start a transaction 004526 ** before starting a subtransaction. The subtransaction is ended automatically 004527 ** if the main transaction commits or rolls back. 
004528 ** 004529 ** Statement subtransactions are used around individual SQL statements 004530 ** that are contained within a BEGIN...COMMIT block. If a constraint 004531 ** error occurs within the statement, the effect of that one statement 004532 ** can be rolled back without having to rollback the entire transaction. 004533 ** 004534 ** A statement sub-transaction is implemented as an anonymous savepoint. The 004535 ** value passed as the second parameter is the total number of savepoints, 004536 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there 004537 ** are no active savepoints and no other statement-transactions open, 004538 ** iStatement is 1. This anonymous savepoint can be released or rolled back 004539 ** using the sqlite3BtreeSavepoint() function. 004540 */ 004541 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){ 004542 int rc; 004543 BtShared *pBt = p->pBt; 004544 sqlite3BtreeEnter(p); 004545 assert( p->inTrans==TRANS_WRITE ); 004546 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 004547 assert( iStatement>0 ); 004548 assert( iStatement>p->db->nSavepoint ); 004549 assert( pBt->inTransaction==TRANS_WRITE ); 004550 /* At the pager level, a statement transaction is a savepoint with 004551 ** an index greater than all savepoints created explicitly using 004552 ** SQL statements. It is illegal to open, release or rollback any 004553 ** such savepoints while the statement transaction savepoint is active. 004554 */ 004555 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement); 004556 sqlite3BtreeLeave(p); 004557 return rc; 004558 } 004559 004560 /* 004561 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK 004562 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the 004563 ** savepoint identified by parameter iSavepoint, depending on the value 004564 ** of op. 004565 ** 004566 ** Normally, iSavepoint is greater than or equal to zero. 
However, if op is 004567 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the 004568 ** contents of the entire transaction are rolled back. This is different 004569 ** from a normal transaction rollback, as no locks are released and the 004570 ** transaction remains open. 004571 */ 004572 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){ 004573 int rc = SQLITE_OK; 004574 if( p && p->inTrans==TRANS_WRITE ){ 004575 BtShared *pBt = p->pBt; 004576 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK ); 004577 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) ); 004578 sqlite3BtreeEnter(p); 004579 if( op==SAVEPOINT_ROLLBACK ){ 004580 rc = saveAllCursors(pBt, 0, 0); 004581 } 004582 if( rc==SQLITE_OK ){ 004583 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint); 004584 } 004585 if( rc==SQLITE_OK ){ 004586 if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){ 004587 pBt->nPage = 0; 004588 } 004589 rc = newDatabase(pBt); 004590 btreeSetNPage(pBt, pBt->pPage1); 004591 004592 /* pBt->nPage might be zero if the database was corrupt when 004593 ** the transaction was started. Otherwise, it must be at least 1. */ 004594 assert( CORRUPT_DB || pBt->nPage>0 ); 004595 } 004596 sqlite3BtreeLeave(p); 004597 } 004598 return rc; 004599 } 004600 004601 /* 004602 ** Create a new cursor for the BTree whose root is on the page 004603 ** iTable. If a read-only cursor is requested, it is assumed that 004604 ** the caller already has at least a read-only transaction open 004605 ** on the database already. If a write-cursor is requested, then 004606 ** the caller is assumed to have an open write transaction. 004607 ** 004608 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only 004609 ** be used for reading. If the BTREE_WRCSR bit is set, then the cursor 004610 ** can be used for reading or for writing if other conditions for writing 004611 ** are also met. 
These are the conditions that must be met in order 004612 ** for writing to be allowed: 004613 ** 004614 ** 1: The cursor must have been opened with wrFlag containing BTREE_WRCSR 004615 ** 004616 ** 2: Other database connections that share the same pager cache 004617 ** but which are not in the READ_UNCOMMITTED state may not have 004618 ** cursors open with wrFlag==0 on the same table. Otherwise 004619 ** the changes made by this write cursor would be visible to 004620 ** the read cursors in the other database connection. 004621 ** 004622 ** 3: The database must be writable (not on read-only media) 004623 ** 004624 ** 4: There must be an active transaction. 004625 ** 004626 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR 004627 ** is set. If FORDELETE is set, that is a hint to the implementation that 004628 ** this cursor will only be used to seek to and delete entries of an index 004629 ** as part of a larger DELETE statement. The FORDELETE hint is not used by 004630 ** this implementation. But in a hypothetical alternative storage engine 004631 ** in which index entries are automatically deleted when corresponding table 004632 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE 004633 ** operations on this cursor can be no-ops and all READ operations can 004634 ** return a null row (2-bytes: 0x01 0x00). 004635 ** 004636 ** No checking is done to make sure that page iTable really is the 004637 ** root page of a b-tree. If it is not, then the cursor acquired 004638 ** will not work correctly. 004639 ** 004640 ** It is assumed that the sqlite3BtreeCursorZero() has been called 004641 ** on pCur to initialize the memory space prior to invoking this routine. 004642 */ 004643 static int btreeCursor( 004644 Btree *p, /* The btree */ 004645 Pgno iTable, /* Root page of table to open */ 004646 int wrFlag, /* 1 to write. 
0 read-only */ 004647 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 004648 BtCursor *pCur /* Space for new cursor */ 004649 ){ 004650 BtShared *pBt = p->pBt; /* Shared b-tree handle */ 004651 BtCursor *pX; /* Looping over other all cursors */ 004652 004653 assert( sqlite3BtreeHoldsMutex(p) ); 004654 assert( wrFlag==0 004655 || wrFlag==BTREE_WRCSR 004656 || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE) 004657 ); 004658 004659 /* The following assert statements verify that if this is a sharable 004660 ** b-tree database, the connection is holding the required table locks, 004661 ** and that no other connection has any open cursor that conflicts with 004662 ** this lock. The iTable<1 term disables the check for corrupt schemas. */ 004663 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) 004664 || iTable<1 ); 004665 assert( wrFlag==0 || !hasReadConflicts(p, iTable) ); 004666 004667 /* Assert that the caller has opened the required transaction. */ 004668 assert( p->inTrans>TRANS_NONE ); 004669 assert( wrFlag==0 || p->inTrans==TRANS_WRITE ); 004670 assert( pBt->pPage1 && pBt->pPage1->aData ); 004671 assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 ); 004672 004673 if( iTable<=1 ){ 004674 if( iTable<1 ){ 004675 return SQLITE_CORRUPT_BKPT; 004676 }else if( btreePagecount(pBt)==0 ){ 004677 assert( wrFlag==0 ); 004678 iTable = 0; 004679 } 004680 } 004681 004682 /* Now that no other errors can occur, finish filling in the BtCursor 004683 ** variables and link the cursor into the BtShared list. */ 004684 pCur->pgnoRoot = iTable; 004685 pCur->iPage = -1; 004686 pCur->pKeyInfo = pKeyInfo; 004687 pCur->pBtree = p; 004688 pCur->pBt = pBt; 004689 pCur->curFlags = 0; 004690 /* If there are two or more cursors on the same btree, then all such 004691 ** cursors *must* have the BTCF_Multiple flag set. 
*/ 004692 for(pX=pBt->pCursor; pX; pX=pX->pNext){ 004693 if( pX->pgnoRoot==iTable ){ 004694 pX->curFlags |= BTCF_Multiple; 004695 pCur->curFlags = BTCF_Multiple; 004696 } 004697 } 004698 pCur->eState = CURSOR_INVALID; 004699 pCur->pNext = pBt->pCursor; 004700 pBt->pCursor = pCur; 004701 if( wrFlag ){ 004702 pCur->curFlags |= BTCF_WriteFlag; 004703 pCur->curPagerFlags = 0; 004704 if( pBt->pTmpSpace==0 ) return allocateTempSpace(pBt); 004705 }else{ 004706 pCur->curPagerFlags = PAGER_GET_READONLY; 004707 } 004708 return SQLITE_OK; 004709 } 004710 static int btreeCursorWithLock( 004711 Btree *p, /* The btree */ 004712 Pgno iTable, /* Root page of table to open */ 004713 int wrFlag, /* 1 to write. 0 read-only */ 004714 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 004715 BtCursor *pCur /* Space for new cursor */ 004716 ){ 004717 int rc; 004718 sqlite3BtreeEnter(p); 004719 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 004720 sqlite3BtreeLeave(p); 004721 return rc; 004722 } 004723 int sqlite3BtreeCursor( 004724 Btree *p, /* The btree */ 004725 Pgno iTable, /* Root page of table to open */ 004726 int wrFlag, /* 1 to write. 0 read-only */ 004727 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */ 004728 BtCursor *pCur /* Write new cursor here */ 004729 ){ 004730 if( p->sharable ){ 004731 return btreeCursorWithLock(p, iTable, wrFlag, pKeyInfo, pCur); 004732 }else{ 004733 return btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 004734 } 004735 } 004736 004737 /* 004738 ** Return the size of a BtCursor object in bytes. 004739 ** 004740 ** This interfaces is needed so that users of cursors can preallocate 004741 ** sufficient storage to hold a cursor. The BtCursor object is opaque 004742 ** to users so they cannot do the sizeof() themselves - they must call 004743 ** this routine. 
004744 */ 004745 int sqlite3BtreeCursorSize(void){ 004746 return ROUND8(sizeof(BtCursor)); 004747 } 004748 004749 #ifdef SQLITE_DEBUG 004750 /* 004751 ** Return true if and only if the Btree object will be automatically 004752 ** closed with the BtCursor closes. This is used within assert() statements 004753 ** only. 004754 */ 004755 int sqlite3BtreeClosesWithCursor( 004756 Btree *pBtree, /* the btree object */ 004757 BtCursor *pCur /* Corresponding cursor */ 004758 ){ 004759 BtShared *pBt = pBtree->pBt; 004760 if( (pBt->openFlags & BTREE_SINGLE)==0 ) return 0; 004761 if( pBt->pCursor!=pCur ) return 0; 004762 if( pCur->pNext!=0 ) return 0; 004763 if( pCur->pBtree!=pBtree ) return 0; 004764 return 1; 004765 } 004766 #endif 004767 004768 /* 004769 ** Initialize memory that will be converted into a BtCursor object. 004770 ** 004771 ** The simple approach here would be to memset() the entire object 004772 ** to zero. But it turns out that the apPage[] and aiIdx[] arrays 004773 ** do not need to be zeroed and they are large, so we can save a lot 004774 ** of run-time by skipping the initialization of those elements. 004775 */ 004776 void sqlite3BtreeCursorZero(BtCursor *p){ 004777 memset(p, 0, offsetof(BtCursor, BTCURSOR_FIRST_UNINIT)); 004778 } 004779 004780 /* 004781 ** Close a cursor. The read lock on the database file is released 004782 ** when the last cursor is closed. 
004783 */ 004784 int sqlite3BtreeCloseCursor(BtCursor *pCur){ 004785 Btree *pBtree = pCur->pBtree; 004786 if( pBtree ){ 004787 BtShared *pBt = pCur->pBt; 004788 sqlite3BtreeEnter(pBtree); 004789 assert( pBt->pCursor!=0 ); 004790 if( pBt->pCursor==pCur ){ 004791 pBt->pCursor = pCur->pNext; 004792 }else{ 004793 BtCursor *pPrev = pBt->pCursor; 004794 do{ 004795 if( pPrev->pNext==pCur ){ 004796 pPrev->pNext = pCur->pNext; 004797 break; 004798 } 004799 pPrev = pPrev->pNext; 004800 }while( ALWAYS(pPrev) ); 004801 } 004802 btreeReleaseAllCursorPages(pCur); 004803 unlockBtreeIfUnused(pBt); 004804 sqlite3_free(pCur->aOverflow); 004805 sqlite3_free(pCur->pKey); 004806 if( (pBt->openFlags & BTREE_SINGLE) && pBt->pCursor==0 ){ 004807 /* Since the BtShared is not sharable, there is no need to 004808 ** worry about the missing sqlite3BtreeLeave() call here. */ 004809 assert( pBtree->sharable==0 ); 004810 sqlite3BtreeClose(pBtree); 004811 }else{ 004812 sqlite3BtreeLeave(pBtree); 004813 } 004814 pCur->pBtree = 0; 004815 } 004816 return SQLITE_OK; 004817 } 004818 004819 /* 004820 ** Make sure the BtCursor* given in the argument has a valid 004821 ** BtCursor.info structure. If it is not already valid, call 004822 ** btreeParseCell() to fill it in. 004823 ** 004824 ** BtCursor.info is a cache of the information in the current cell. 004825 ** Using this cache reduces the number of calls to btreeParseCell(). 
004826 */ 004827 #ifndef NDEBUG 004828 static int cellInfoEqual(CellInfo *a, CellInfo *b){ 004829 if( a->nKey!=b->nKey ) return 0; 004830 if( a->pPayload!=b->pPayload ) return 0; 004831 if( a->nPayload!=b->nPayload ) return 0; 004832 if( a->nLocal!=b->nLocal ) return 0; 004833 if( a->nSize!=b->nSize ) return 0; 004834 return 1; 004835 } 004836 static void assertCellInfo(BtCursor *pCur){ 004837 CellInfo info; 004838 memset(&info, 0, sizeof(info)); 004839 btreeParseCell(pCur->pPage, pCur->ix, &info); 004840 assert( CORRUPT_DB || cellInfoEqual(&info, &pCur->info) ); 004841 } 004842 #else 004843 #define assertCellInfo(x) 004844 #endif 004845 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){ 004846 if( pCur->info.nSize==0 ){ 004847 pCur->curFlags |= BTCF_ValidNKey; 004848 btreeParseCell(pCur->pPage,pCur->ix,&pCur->info); 004849 }else{ 004850 assertCellInfo(pCur); 004851 } 004852 } 004853 004854 #ifndef NDEBUG /* The next routine used only within assert() statements */ 004855 /* 004856 ** Return true if the given BtCursor is valid. A valid cursor is one 004857 ** that is currently pointing to a row in a (non-empty) table. 004858 ** This is a verification routine is used only within assert() statements. 004859 */ 004860 int sqlite3BtreeCursorIsValid(BtCursor *pCur){ 004861 return pCur && pCur->eState==CURSOR_VALID; 004862 } 004863 #endif /* NDEBUG */ 004864 int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){ 004865 assert( pCur!=0 ); 004866 return pCur->eState==CURSOR_VALID; 004867 } 004868 004869 /* 004870 ** Return the value of the integer key or "rowid" for a table btree. 004871 ** This routine is only valid for a cursor that is pointing into a 004872 ** ordinary table btree. If the cursor points to an index btree or 004873 ** is invalid, the result of this routine is undefined. 
004874 */ 004875 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){ 004876 assert( cursorHoldsMutex(pCur) ); 004877 assert( pCur->eState==CURSOR_VALID ); 004878 assert( pCur->curIntKey ); 004879 getCellInfo(pCur); 004880 return pCur->info.nKey; 004881 } 004882 004883 /* 004884 ** Pin or unpin a cursor. 004885 */ 004886 void sqlite3BtreeCursorPin(BtCursor *pCur){ 004887 assert( (pCur->curFlags & BTCF_Pinned)==0 ); 004888 pCur->curFlags |= BTCF_Pinned; 004889 } 004890 void sqlite3BtreeCursorUnpin(BtCursor *pCur){ 004891 assert( (pCur->curFlags & BTCF_Pinned)!=0 ); 004892 pCur->curFlags &= ~BTCF_Pinned; 004893 } 004894 004895 /* 004896 ** Return the offset into the database file for the start of the 004897 ** payload to which the cursor is pointing. 004898 */ 004899 i64 sqlite3BtreeOffset(BtCursor *pCur){ 004900 assert( cursorHoldsMutex(pCur) ); 004901 assert( pCur->eState==CURSOR_VALID ); 004902 getCellInfo(pCur); 004903 return (i64)pCur->pBt->pageSize*((i64)pCur->pPage->pgno - 1) + 004904 (i64)(pCur->info.pPayload - pCur->pPage->aData); 004905 } 004906 004907 /* 004908 ** Return the number of bytes of payload for the entry that pCur is 004909 ** currently pointing to. For table btrees, this will be the amount 004910 ** of data. For index btrees, this will be the size of the key. 004911 ** 004912 ** The caller must guarantee that the cursor is pointing to a non-NULL 004913 ** valid entry. In other words, the calling procedure must guarantee 004914 ** that the cursor has Cursor.eState==CURSOR_VALID. 004915 */ 004916 u32 sqlite3BtreePayloadSize(BtCursor *pCur){ 004917 assert( cursorHoldsMutex(pCur) ); 004918 assert( pCur->eState==CURSOR_VALID ); 004919 getCellInfo(pCur); 004920 return pCur->info.nPayload; 004921 } 004922 004923 /* 004924 ** Return an upper bound on the size of any record for the table 004925 ** that the cursor is pointing into. 004926 ** 004927 ** This is an optimization. 
Everything will still work if this 004928 ** routine always returns 2147483647 (which is the largest record 004929 ** that SQLite can handle) or more. But returning a smaller value might 004930 ** prevent large memory allocations when trying to interpret a 004931 ** corrupt database. 004932 ** 004933 ** The current implementation merely returns the size of the underlying 004934 ** database file. 004935 */ 004936 sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor *pCur){ 004937 assert( cursorHoldsMutex(pCur) ); 004938 assert( pCur->eState==CURSOR_VALID ); 004939 return pCur->pBt->pageSize * (sqlite3_int64)pCur->pBt->nPage; 004940 } 004941 004942 /* 004943 ** Given the page number of an overflow page in the database (parameter 004944 ** ovfl), this function finds the page number of the next page in the 004945 ** linked list of overflow pages. If possible, it uses the auto-vacuum 004946 ** pointer-map data instead of reading the content of page ovfl to do so. 004947 ** 004948 ** If an error occurs an SQLite error code is returned. Otherwise: 004949 ** 004950 ** The page number of the next overflow page in the linked list is 004951 ** written to *pPgnoNext. If page ovfl is the last page in its linked 004952 ** list, *pPgnoNext is set to zero. 004953 ** 004954 ** If ppPage is not NULL, and a reference to the MemPage object corresponding 004955 ** to page number pOvfl was obtained, then *ppPage is set to point to that 004956 ** reference. It is the responsibility of the caller to call releasePage() 004957 ** on *ppPage to free the reference. In no reference was obtained (because 004958 ** the pointer-map was used to obtain the value for *pPgnoNext), then 004959 ** *ppPage is set to zero. 
004960 */ 004961 static int getOverflowPage( 004962 BtShared *pBt, /* The database file */ 004963 Pgno ovfl, /* Current overflow page number */ 004964 MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */ 004965 Pgno *pPgnoNext /* OUT: Next overflow page number */ 004966 ){ 004967 Pgno next = 0; 004968 MemPage *pPage = 0; 004969 int rc = SQLITE_OK; 004970 004971 assert( sqlite3_mutex_held(pBt->mutex) ); 004972 assert(pPgnoNext); 004973 004974 #ifndef SQLITE_OMIT_AUTOVACUUM 004975 /* Try to find the next page in the overflow list using the 004976 ** autovacuum pointer-map pages. Guess that the next page in 004977 ** the overflow list is page number (ovfl+1). If that guess turns 004978 ** out to be wrong, fall back to loading the data of page 004979 ** number ovfl to determine the next page number. 004980 */ 004981 if( pBt->autoVacuum ){ 004982 Pgno pgno; 004983 Pgno iGuess = ovfl+1; 004984 u8 eType; 004985 004986 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ 004987 iGuess++; 004988 } 004989 004990 if( iGuess<=btreePagecount(pBt) ){ 004991 rc = ptrmapGet(pBt, iGuess, &eType, &pgno); 004992 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ 004993 next = iGuess; 004994 rc = SQLITE_DONE; 004995 } 004996 } 004997 } 004998 #endif 004999 005000 assert( next==0 || rc==SQLITE_DONE ); 005001 if( rc==SQLITE_OK ){ 005002 rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0); 005003 assert( rc==SQLITE_OK || pPage==0 ); 005004 if( rc==SQLITE_OK ){ 005005 next = get4byte(pPage->aData); 005006 } 005007 } 005008 005009 *pPgnoNext = next; 005010 if( ppPage ){ 005011 *ppPage = pPage; 005012 }else{ 005013 releasePage(pPage); 005014 } 005015 return (rc==SQLITE_DONE ? SQLITE_OK : rc); 005016 } 005017 005018 /* 005019 ** Copy data from a buffer to a page, or from a page to a buffer. 005020 ** 005021 ** pPayload is a pointer to data stored on database page pDbPage. 
005022 ** If argument eOp is false, then nByte bytes of data are copied 005023 ** from pPayload to the buffer pointed at by pBuf. If eOp is true, 005024 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes 005025 ** of data are copied from the buffer pBuf to pPayload. 005026 ** 005027 ** SQLITE_OK is returned on success, otherwise an error code. 005028 */ 005029 static int copyPayload( 005030 void *pPayload, /* Pointer to page data */ 005031 void *pBuf, /* Pointer to buffer */ 005032 int nByte, /* Number of bytes to copy */ 005033 int eOp, /* 0 -> copy from page, 1 -> copy to page */ 005034 DbPage *pDbPage /* Page containing pPayload */ 005035 ){ 005036 if( eOp ){ 005037 /* Copy data from buffer to page (a write operation) */ 005038 int rc = sqlite3PagerWrite(pDbPage); 005039 if( rc!=SQLITE_OK ){ 005040 return rc; 005041 } 005042 memcpy(pPayload, pBuf, nByte); 005043 }else{ 005044 /* Copy data from page to buffer (a read operation) */ 005045 memcpy(pBuf, pPayload, nByte); 005046 } 005047 return SQLITE_OK; 005048 } 005049 005050 /* 005051 ** This function is used to read or overwrite payload information 005052 ** for the entry that the pCur cursor is pointing to. The eOp 005053 ** argument is interpreted as follows: 005054 ** 005055 ** 0: The operation is a read. Populate the overflow cache. 005056 ** 1: The operation is a write. Populate the overflow cache. 005057 ** 005058 ** A total of "amt" bytes are read or written beginning at "offset". 005059 ** Data is read to or from the buffer pBuf. 005060 ** 005061 ** The content being read or written might appear on the main page 005062 ** or be scattered out on multiple overflow pages. 005063 ** 005064 ** If the current cursor entry uses one or more overflow pages 005065 ** this function may allocate space for and lazily populate 005066 ** the overflow page-list cache array (BtCursor.aOverflow). 005067 ** Subsequent calls use this cache to make seeking to the supplied offset 005068 ** more efficient. 
005069 ** 005070 ** Once an overflow page-list cache has been allocated, it must be 005071 ** invalidated if some other cursor writes to the same table, or if 005072 ** the cursor is moved to a different row. Additionally, in auto-vacuum 005073 ** mode, the following events may invalidate an overflow page-list cache. 005074 ** 005075 ** * An incremental vacuum, 005076 ** * A commit in auto_vacuum="full" mode, 005077 ** * Creating a table (may require moving an overflow page). 005078 */ 005079 static int accessPayload( 005080 BtCursor *pCur, /* Cursor pointing to entry to read from */ 005081 u32 offset, /* Begin reading this far into payload */ 005082 u32 amt, /* Read this many bytes */ 005083 unsigned char *pBuf, /* Write the bytes into this buffer */ 005084 int eOp /* zero to read. non-zero to write. */ 005085 ){ 005086 unsigned char *aPayload; 005087 int rc = SQLITE_OK; 005088 int iIdx = 0; 005089 MemPage *pPage = pCur->pPage; /* Btree page of current entry */ 005090 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */ 005091 #ifdef SQLITE_DIRECT_OVERFLOW_READ 005092 unsigned char * const pBufStart = pBuf; /* Start of original out buffer */ 005093 #endif 005094 005095 assert( pPage ); 005096 assert( eOp==0 || eOp==1 ); 005097 assert( pCur->eState==CURSOR_VALID ); 005098 if( pCur->ix>=pPage->nCell ){ 005099 return SQLITE_CORRUPT_PAGE(pPage); 005100 } 005101 assert( cursorHoldsMutex(pCur) ); 005102 005103 getCellInfo(pCur); 005104 aPayload = pCur->info.pPayload; 005105 assert( offset+amt <= pCur->info.nPayload ); 005106 005107 assert( aPayload > pPage->aData ); 005108 if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){ 005109 /* Trying to read or write past the end of the data is an error. 
The 005110 ** conditional above is really: 005111 ** &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] 005112 ** but is recast into its current form to avoid integer overflow problems 005113 */ 005114 return SQLITE_CORRUPT_PAGE(pPage); 005115 } 005116 005117 /* Check if data must be read/written to/from the btree page itself. */ 005118 if( offset<pCur->info.nLocal ){ 005119 int a = amt; 005120 if( a+offset>pCur->info.nLocal ){ 005121 a = pCur->info.nLocal - offset; 005122 } 005123 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage); 005124 offset = 0; 005125 pBuf += a; 005126 amt -= a; 005127 }else{ 005128 offset -= pCur->info.nLocal; 005129 } 005130 005131 005132 if( rc==SQLITE_OK && amt>0 ){ 005133 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ 005134 Pgno nextPage; 005135 005136 nextPage = get4byte(&aPayload[pCur->info.nLocal]); 005137 005138 /* If the BtCursor.aOverflow[] has not been allocated, allocate it now. 005139 ** 005140 ** The aOverflow[] array is sized at one entry for each overflow page 005141 ** in the overflow chain. The page number of the first overflow page is 005142 ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array 005143 ** means "not yet known" (the cache is lazily populated). 
005144 */ 005145 if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){ 005146 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; 005147 if( pCur->aOverflow==0 005148 || nOvfl*(int)sizeof(Pgno) > sqlite3MallocSize(pCur->aOverflow) 005149 ){ 005150 Pgno *aNew; 005151 if( sqlite3FaultSim(413) ){ 005152 aNew = 0; 005153 }else{ 005154 aNew = (Pgno*)sqlite3Realloc(pCur->aOverflow, nOvfl*2*sizeof(Pgno)); 005155 } 005156 if( aNew==0 ){ 005157 return SQLITE_NOMEM_BKPT; 005158 }else{ 005159 pCur->aOverflow = aNew; 005160 } 005161 } 005162 memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno)); 005163 pCur->curFlags |= BTCF_ValidOvfl; 005164 }else{ 005165 /* Sanity check the validity of the overflow page cache */ 005166 assert( pCur->aOverflow[0]==nextPage 005167 || pCur->aOverflow[0]==0 005168 || CORRUPT_DB ); 005169 assert( pCur->aOverflow[0]!=0 || pCur->aOverflow[offset/ovflSize]==0 ); 005170 005171 /* If the overflow page-list cache has been allocated and the 005172 ** entry for the first required overflow page is valid, skip 005173 ** directly to it. 005174 */ 005175 if( pCur->aOverflow[offset/ovflSize] ){ 005176 iIdx = (offset/ovflSize); 005177 nextPage = pCur->aOverflow[iIdx]; 005178 offset = (offset%ovflSize); 005179 } 005180 } 005181 005182 assert( rc==SQLITE_OK && amt>0 ); 005183 while( nextPage ){ 005184 /* If required, populate the overflow page-list cache. */ 005185 if( nextPage > pBt->nPage ) return SQLITE_CORRUPT_BKPT; 005186 assert( pCur->aOverflow[iIdx]==0 005187 || pCur->aOverflow[iIdx]==nextPage 005188 || CORRUPT_DB ); 005189 pCur->aOverflow[iIdx] = nextPage; 005190 005191 if( offset>=ovflSize ){ 005192 /* The only reason to read this page is to obtain the page 005193 ** number for the next page in the overflow chain. The page 005194 ** data is not required. So first try to lookup the overflow 005195 ** page-list cache, if any, then fall back to the getOverflowPage() 005196 ** function. 
005197 */ 005198 assert( pCur->curFlags & BTCF_ValidOvfl ); 005199 assert( pCur->pBtree->db==pBt->db ); 005200 if( pCur->aOverflow[iIdx+1] ){ 005201 nextPage = pCur->aOverflow[iIdx+1]; 005202 }else{ 005203 rc = getOverflowPage(pBt, nextPage, 0, &nextPage); 005204 } 005205 offset -= ovflSize; 005206 }else{ 005207 /* Need to read this page properly. It contains some of the 005208 ** range of data that is being read (eOp==0) or written (eOp!=0). 005209 */ 005210 int a = amt; 005211 if( a + offset > ovflSize ){ 005212 a = ovflSize - offset; 005213 } 005214 005215 #ifdef SQLITE_DIRECT_OVERFLOW_READ 005216 /* If all the following are true: 005217 ** 005218 ** 1) this is a read operation, and 005219 ** 2) data is required from the start of this overflow page, and 005220 ** 3) there are no dirty pages in the page-cache 005221 ** 4) the database is file-backed, and 005222 ** 5) the page is not in the WAL file 005223 ** 6) at least 4 bytes have already been read into the output buffer 005224 ** 005225 ** then data can be read directly from the database file into the 005226 ** output buffer, bypassing the page-cache altogether. This speeds 005227 ** up loading large records that span many overflow pages. 005228 */ 005229 if( eOp==0 /* (1) */ 005230 && offset==0 /* (2) */ 005231 && sqlite3PagerDirectReadOk(pBt->pPager, nextPage) /* (3,4,5) */ 005232 && &pBuf[-4]>=pBufStart /* (6) */ 005233 ){ 005234 sqlite3_file *fd = sqlite3PagerFile(pBt->pPager); 005235 u8 aSave[4]; 005236 u8 *aWrite = &pBuf[-4]; 005237 assert( aWrite>=pBufStart ); /* due to (6) */ 005238 memcpy(aSave, aWrite, 4); 005239 rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1)); 005240 nextPage = get4byte(aWrite); 005241 memcpy(aWrite, aSave, 4); 005242 }else 005243 #endif 005244 005245 { 005246 DbPage *pDbPage; 005247 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage, 005248 (eOp==0 ? 
PAGER_GET_READONLY : 0) 005249 ); 005250 if( rc==SQLITE_OK ){ 005251 aPayload = sqlite3PagerGetData(pDbPage); 005252 nextPage = get4byte(aPayload); 005253 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); 005254 sqlite3PagerUnref(pDbPage); 005255 offset = 0; 005256 } 005257 } 005258 amt -= a; 005259 if( amt==0 ) return rc; 005260 pBuf += a; 005261 } 005262 if( rc ) break; 005263 iIdx++; 005264 } 005265 } 005266 005267 if( rc==SQLITE_OK && amt>0 ){ 005268 /* Overflow chain ends prematurely */ 005269 return SQLITE_CORRUPT_PAGE(pPage); 005270 } 005271 return rc; 005272 } 005273 005274 /* 005275 ** Read part of the payload for the row at which that cursor pCur is currently 005276 ** pointing. "amt" bytes will be transferred into pBuf[]. The transfer 005277 ** begins at "offset". 005278 ** 005279 ** pCur can be pointing to either a table or an index b-tree. 005280 ** If pointing to a table btree, then the content section is read. If 005281 ** pCur is pointing to an index b-tree then the key section is read. 005282 ** 005283 ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing 005284 ** to a valid row in the table. For sqlite3BtreePayloadChecked(), the 005285 ** cursor might be invalid or might need to be restored before being read. 005286 ** 005287 ** Return SQLITE_OK on success or an error code if anything goes 005288 ** wrong. An error is returned if "offset+amt" is larger than 005289 ** the available payload. 005290 */ 005291 int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 005292 assert( cursorHoldsMutex(pCur) ); 005293 assert( pCur->eState==CURSOR_VALID ); 005294 assert( pCur->iPage>=0 && pCur->pPage ); 005295 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0); 005296 } 005297 005298 /* 005299 ** This variant of sqlite3BtreePayload() works even if the cursor has not 005300 ** in the CURSOR_VALID state. It is only used by the sqlite3_blob_read() 005301 ** interface. 
005302 */ 005303 #ifndef SQLITE_OMIT_INCRBLOB 005304 static SQLITE_NOINLINE int accessPayloadChecked( 005305 BtCursor *pCur, 005306 u32 offset, 005307 u32 amt, 005308 void *pBuf 005309 ){ 005310 int rc; 005311 if ( pCur->eState==CURSOR_INVALID ){ 005312 return SQLITE_ABORT; 005313 } 005314 assert( cursorOwnsBtShared(pCur) ); 005315 rc = btreeRestoreCursorPosition(pCur); 005316 return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0); 005317 } 005318 int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 005319 if( pCur->eState==CURSOR_VALID ){ 005320 assert( cursorOwnsBtShared(pCur) ); 005321 return accessPayload(pCur, offset, amt, pBuf, 0); 005322 }else{ 005323 return accessPayloadChecked(pCur, offset, amt, pBuf); 005324 } 005325 } 005326 #endif /* SQLITE_OMIT_INCRBLOB */ 005327 005328 /* 005329 ** Return a pointer to payload information from the entry that the 005330 ** pCur cursor is pointing to. The pointer is to the beginning of 005331 ** the key if index btrees (pPage->intKey==0) and is the data for 005332 ** table btrees (pPage->intKey==1). The number of bytes of available 005333 ** key/data is written into *pAmt. If *pAmt==0, then the value 005334 ** returned will not be a valid pointer. 005335 ** 005336 ** This routine is an optimization. It is common for the entire key 005337 ** and data to fit on the local page and for there to be no overflow 005338 ** pages. When that is so, this routine can be used to access the 005339 ** key and data without making a copy. If the key and/or data spills 005340 ** onto overflow pages, then accessPayload() must be used to reassemble 005341 ** the key/data and copy it into a preallocated buffer. 005342 ** 005343 ** The pointer returned by this routine looks directly into the cached 005344 ** page of the database. The data might change or move the next time 005345 ** any btree routine is called. 
005346 */ 005347 static const void *fetchPayload( 005348 BtCursor *pCur, /* Cursor pointing to entry to read from */ 005349 u32 *pAmt /* Write the number of available bytes here */ 005350 ){ 005351 int amt; 005352 assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage); 005353 assert( pCur->eState==CURSOR_VALID ); 005354 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005355 assert( cursorOwnsBtShared(pCur) ); 005356 assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB ); 005357 assert( pCur->info.nSize>0 ); 005358 assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB ); 005359 assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB); 005360 amt = pCur->info.nLocal; 005361 if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){ 005362 /* There is too little space on the page for the expected amount 005363 ** of local content. Database must be corrupt. */ 005364 assert( CORRUPT_DB ); 005365 amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload)); 005366 } 005367 *pAmt = (u32)amt; 005368 return (void*)pCur->info.pPayload; 005369 } 005370 005371 005372 /* 005373 ** For the entry that cursor pCur is point to, return as 005374 ** many bytes of the key or data as are available on the local 005375 ** b-tree page. Write the number of available bytes into *pAmt. 005376 ** 005377 ** The pointer returned is ephemeral. The key/data may move 005378 ** or be destroyed on the next call to any Btree routine, 005379 ** including calls from other threads against the same cache. 005380 ** Hence, a mutex on the BtShared should be held prior to calling 005381 ** this routine. 005382 ** 005383 ** These routines is used to get quick access to key and data 005384 ** in the common case where no overflow pages are used. 005385 */ 005386 const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){ 005387 return fetchPayload(pCur, pAmt); 005388 } 005389 005390 005391 /* 005392 ** Move the cursor down to a new child page. 
The newPgno argument is the 005393 ** page number of the child page to move to. 005394 ** 005395 ** This function returns SQLITE_CORRUPT if the page-header flags field of 005396 ** the new child page does not match the flags field of the parent (i.e. 005397 ** if an intkey page appears to be the parent of a non-intkey page, or 005398 ** vice-versa). 005399 */ 005400 static int moveToChild(BtCursor *pCur, u32 newPgno){ 005401 int rc; 005402 assert( cursorOwnsBtShared(pCur) ); 005403 assert( pCur->eState==CURSOR_VALID ); 005404 assert( pCur->iPage<BTCURSOR_MAX_DEPTH ); 005405 assert( pCur->iPage>=0 ); 005406 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ 005407 return SQLITE_CORRUPT_BKPT; 005408 } 005409 pCur->info.nSize = 0; 005410 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 005411 pCur->aiIdx[pCur->iPage] = pCur->ix; 005412 pCur->apPage[pCur->iPage] = pCur->pPage; 005413 pCur->ix = 0; 005414 pCur->iPage++; 005415 rc = getAndInitPage(pCur->pBt, newPgno, &pCur->pPage, pCur->curPagerFlags); 005416 assert( pCur->pPage!=0 || rc!=SQLITE_OK ); 005417 if( rc==SQLITE_OK 005418 && (pCur->pPage->nCell<1 || pCur->pPage->intKey!=pCur->curIntKey) 005419 ){ 005420 releasePage(pCur->pPage); 005421 rc = SQLITE_CORRUPT_PGNO(newPgno); 005422 } 005423 if( rc ){ 005424 pCur->pPage = pCur->apPage[--pCur->iPage]; 005425 } 005426 return rc; 005427 } 005428 005429 #ifdef SQLITE_DEBUG 005430 /* 005431 ** Page pParent is an internal (non-leaf) tree page. This function 005432 ** asserts that page number iChild is the left-child if the iIdx'th 005433 ** cell in page pParent. Or, if iIdx is equal to the total number of 005434 ** cells in pParent, that page number iChild is the right-child of 005435 ** the page. 
005436 */ 005437 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){ 005438 if( CORRUPT_DB ) return; /* The conditions tested below might not be true 005439 ** in a corrupt database */ 005440 assert( iIdx<=pParent->nCell ); 005441 if( iIdx==pParent->nCell ){ 005442 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild ); 005443 }else{ 005444 assert( get4byte(findCell(pParent, iIdx))==iChild ); 005445 } 005446 } 005447 #else 005448 # define assertParentIndex(x,y,z) 005449 #endif 005450 005451 /* 005452 ** Move the cursor up to the parent page. 005453 ** 005454 ** pCur->idx is set to the cell index that contains the pointer 005455 ** to the page we are coming from. If we are coming from the 005456 ** right-most child page then pCur->idx is set to one more than 005457 ** the largest cell index. 005458 */ 005459 static void moveToParent(BtCursor *pCur){ 005460 MemPage *pLeaf; 005461 assert( cursorOwnsBtShared(pCur) ); 005462 assert( pCur->eState==CURSOR_VALID ); 005463 assert( pCur->iPage>0 ); 005464 assert( pCur->pPage ); 005465 assertParentIndex( 005466 pCur->apPage[pCur->iPage-1], 005467 pCur->aiIdx[pCur->iPage-1], 005468 pCur->pPage->pgno 005469 ); 005470 testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell ); 005471 pCur->info.nSize = 0; 005472 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 005473 pCur->ix = pCur->aiIdx[pCur->iPage-1]; 005474 pLeaf = pCur->pPage; 005475 pCur->pPage = pCur->apPage[--pCur->iPage]; 005476 releasePageNotNull(pLeaf); 005477 } 005478 005479 /* 005480 ** Move the cursor to point to the root page of its b-tree structure. 005481 ** 005482 ** If the table has a virtual root page, then the cursor is moved to point 005483 ** to the virtual root page instead of the actual root page. A table has a 005484 ** virtual root page when the actual root page contains no cells and a 005485 ** single child page. This can only happen with the table rooted at page 1. 
005486 ** 005487 ** If the b-tree structure is empty, the cursor state is set to 005488 ** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise, 005489 ** the cursor is set to point to the first cell located on the root 005490 ** (or virtual root) page and the cursor state is set to CURSOR_VALID. 005491 ** 005492 ** If this function returns successfully, it may be assumed that the 005493 ** page-header flags indicate that the [virtual] root-page is the expected 005494 ** kind of b-tree page (i.e. if when opening the cursor the caller did not 005495 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D, 005496 ** indicating a table b-tree, or if the caller did specify a KeyInfo 005497 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index 005498 ** b-tree). 005499 */ 005500 static int moveToRoot(BtCursor *pCur){ 005501 MemPage *pRoot; 005502 int rc = SQLITE_OK; 005503 005504 assert( cursorOwnsBtShared(pCur) ); 005505 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); 005506 assert( CURSOR_VALID < CURSOR_REQUIRESEEK ); 005507 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); 005508 assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 ); 005509 assert( pCur->pgnoRoot>0 || pCur->iPage<0 ); 005510 005511 if( pCur->iPage>=0 ){ 005512 if( pCur->iPage ){ 005513 releasePageNotNull(pCur->pPage); 005514 while( --pCur->iPage ){ 005515 releasePageNotNull(pCur->apPage[pCur->iPage]); 005516 } 005517 pRoot = pCur->pPage = pCur->apPage[0]; 005518 goto skip_init; 005519 } 005520 }else if( pCur->pgnoRoot==0 ){ 005521 pCur->eState = CURSOR_INVALID; 005522 return SQLITE_EMPTY; 005523 }else{ 005524 assert( pCur->iPage==(-1) ); 005525 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 005526 if( pCur->eState==CURSOR_FAULT ){ 005527 assert( pCur->skipNext!=SQLITE_OK ); 005528 return pCur->skipNext; 005529 } 005530 sqlite3BtreeClearCursor(pCur); 005531 } 005532 rc = getAndInitPage(pCur->pBt, pCur->pgnoRoot, &pCur->pPage, 005533 pCur->curPagerFlags); 005534 if( 
rc!=SQLITE_OK ){ 005535 pCur->eState = CURSOR_INVALID; 005536 return rc; 005537 } 005538 pCur->iPage = 0; 005539 pCur->curIntKey = pCur->pPage->intKey; 005540 } 005541 pRoot = pCur->pPage; 005542 assert( pRoot->pgno==pCur->pgnoRoot || CORRUPT_DB ); 005543 005544 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor 005545 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is 005546 ** NULL, the caller expects a table b-tree. If this is not the case, 005547 ** return an SQLITE_CORRUPT error. 005548 ** 005549 ** Earlier versions of SQLite assumed that this test could not fail 005550 ** if the root page was already loaded when this function was called (i.e. 005551 ** if pCur->iPage>=0). But this is not so if the database is corrupted 005552 ** in such a way that page pRoot is linked into a second b-tree table 005553 ** (or the freelist). */ 005554 assert( pRoot->intKey==1 || pRoot->intKey==0 ); 005555 if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){ 005556 return SQLITE_CORRUPT_PAGE(pCur->pPage); 005557 } 005558 005559 skip_init: 005560 pCur->ix = 0; 005561 pCur->info.nSize = 0; 005562 pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl); 005563 005564 if( pRoot->nCell>0 ){ 005565 pCur->eState = CURSOR_VALID; 005566 }else if( !pRoot->leaf ){ 005567 Pgno subpage; 005568 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT; 005569 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); 005570 pCur->eState = CURSOR_VALID; 005571 rc = moveToChild(pCur, subpage); 005572 }else{ 005573 pCur->eState = CURSOR_INVALID; 005574 rc = SQLITE_EMPTY; 005575 } 005576 return rc; 005577 } 005578 005579 /* 005580 ** Move the cursor down to the left-most leaf entry beneath the 005581 ** entry to which it is currently pointing. 005582 ** 005583 ** The left-most leaf is the one with the smallest key - the first 005584 ** in ascending order. 
005585 */ 005586 static int moveToLeftmost(BtCursor *pCur){ 005587 Pgno pgno; 005588 int rc = SQLITE_OK; 005589 MemPage *pPage; 005590 005591 assert( cursorOwnsBtShared(pCur) ); 005592 assert( pCur->eState==CURSOR_VALID ); 005593 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ 005594 assert( pCur->ix<pPage->nCell ); 005595 pgno = get4byte(findCell(pPage, pCur->ix)); 005596 rc = moveToChild(pCur, pgno); 005597 } 005598 return rc; 005599 } 005600 005601 /* 005602 ** Move the cursor down to the right-most leaf entry beneath the 005603 ** page to which it is currently pointing. Notice the difference 005604 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 005605 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() 005606 ** finds the right-most entry beneath the *page*. 005607 ** 005608 ** The right-most entry is the one with the largest key - the last 005609 ** key in ascending order. 005610 */ 005611 static int moveToRightmost(BtCursor *pCur){ 005612 Pgno pgno; 005613 int rc = SQLITE_OK; 005614 MemPage *pPage = 0; 005615 005616 assert( cursorOwnsBtShared(pCur) ); 005617 assert( pCur->eState==CURSOR_VALID ); 005618 while( !(pPage = pCur->pPage)->leaf ){ 005619 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 005620 pCur->ix = pPage->nCell; 005621 rc = moveToChild(pCur, pgno); 005622 if( rc ) return rc; 005623 } 005624 pCur->ix = pPage->nCell-1; 005625 assert( pCur->info.nSize==0 ); 005626 assert( (pCur->curFlags & BTCF_ValidNKey)==0 ); 005627 return SQLITE_OK; 005628 } 005629 005630 /* Move the cursor to the first entry in the table. Return SQLITE_OK 005631 ** on success. Set *pRes to 0 if the cursor actually points to something 005632 ** or set *pRes to 1 if the table is empty. 
005633 */ 005634 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ 005635 int rc; 005636 005637 assert( cursorOwnsBtShared(pCur) ); 005638 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005639 rc = moveToRoot(pCur); 005640 if( rc==SQLITE_OK ){ 005641 assert( pCur->pPage->nCell>0 ); 005642 *pRes = 0; 005643 rc = moveToLeftmost(pCur); 005644 }else if( rc==SQLITE_EMPTY ){ 005645 assert( pCur->pgnoRoot==0 || (pCur->pPage!=0 && pCur->pPage->nCell==0) ); 005646 *pRes = 1; 005647 rc = SQLITE_OK; 005648 } 005649 return rc; 005650 } 005651 005652 #ifdef SQLITE_DEBUG 005653 /* The cursors is CURSOR_VALID and has BTCF_AtLast set. Verify that 005654 ** this flags are true for a consistent database. 005655 ** 005656 ** This routine is is called from within assert() statements only. 005657 ** It is an internal verification routine and does not appear in production 005658 ** builds. 005659 */ 005660 static int cursorIsAtLastEntry(BtCursor *pCur){ 005661 int ii; 005662 for(ii=0; ii<pCur->iPage; ii++){ 005663 if( pCur->aiIdx[ii]!=pCur->apPage[ii]->nCell ) return 0; 005664 } 005665 return pCur->ix==pCur->pPage->nCell-1 && pCur->pPage->leaf!=0; 005666 } 005667 #endif 005668 005669 /* Move the cursor to the last entry in the table. Return SQLITE_OK 005670 ** on success. Set *pRes to 0 if the cursor actually points to something 005671 ** or set *pRes to 1 if the table is empty. 
005672 */ 005673 static SQLITE_NOINLINE int btreeLast(BtCursor *pCur, int *pRes){ 005674 int rc = moveToRoot(pCur); 005675 if( rc==SQLITE_OK ){ 005676 assert( pCur->eState==CURSOR_VALID ); 005677 *pRes = 0; 005678 rc = moveToRightmost(pCur); 005679 if( rc==SQLITE_OK ){ 005680 pCur->curFlags |= BTCF_AtLast; 005681 }else{ 005682 pCur->curFlags &= ~BTCF_AtLast; 005683 } 005684 }else if( rc==SQLITE_EMPTY ){ 005685 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 005686 *pRes = 1; 005687 rc = SQLITE_OK; 005688 } 005689 return rc; 005690 } 005691 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ 005692 assert( cursorOwnsBtShared(pCur) ); 005693 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005694 005695 /* If the cursor already points to the last entry, this is a no-op. */ 005696 if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){ 005697 assert( cursorIsAtLastEntry(pCur) || CORRUPT_DB ); 005698 *pRes = 0; 005699 return SQLITE_OK; 005700 } 005701 return btreeLast(pCur, pRes); 005702 } 005703 005704 /* Move the cursor so that it points to an entry in a table (a.k.a INTKEY) 005705 ** table near the key intKey. Return a success code. 005706 ** 005707 ** If an exact match is not found, then the cursor is always 005708 ** left pointing at a leaf page which would hold the entry if it 005709 ** were present. The cursor might point to an entry that comes 005710 ** before or after the key. 005711 ** 005712 ** An integer is written into *pRes which is the result of 005713 ** comparing the key with the entry to which the cursor is 005714 ** pointing. The meaning of the integer written into 005715 ** *pRes is as follows: 005716 ** 005717 ** *pRes<0 The cursor is left pointing at an entry that 005718 ** is smaller than intKey or if the table is empty 005719 ** and the cursor is therefore left point to nothing. 005720 ** 005721 ** *pRes==0 The cursor is left pointing at an entry that 005722 ** exactly matches intKey. 
005723 ** 005724 ** *pRes>0 The cursor is left pointing at an entry that 005725 ** is larger than intKey. 005726 */ 005727 int sqlite3BtreeTableMoveto( 005728 BtCursor *pCur, /* The cursor to be moved */ 005729 i64 intKey, /* The table key */ 005730 int biasRight, /* If true, bias the search to the high end */ 005731 int *pRes /* Write search results here */ 005732 ){ 005733 int rc; 005734 005735 assert( cursorOwnsBtShared(pCur) ); 005736 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005737 assert( pRes ); 005738 assert( pCur->pKeyInfo==0 ); 005739 assert( pCur->eState!=CURSOR_VALID || pCur->curIntKey!=0 ); 005740 005741 /* If the cursor is already positioned at the point we are trying 005742 ** to move to, then just return without doing any work */ 005743 if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0 ){ 005744 if( pCur->info.nKey==intKey ){ 005745 *pRes = 0; 005746 return SQLITE_OK; 005747 } 005748 if( pCur->info.nKey<intKey ){ 005749 if( (pCur->curFlags & BTCF_AtLast)!=0 ){ 005750 assert( cursorIsAtLastEntry(pCur) || CORRUPT_DB ); 005751 *pRes = -1; 005752 return SQLITE_OK; 005753 } 005754 /* If the requested key is one more than the previous key, then 005755 ** try to get there using sqlite3BtreeNext() rather than a full 005756 ** binary search. This is an optimization only. The correct answer 005757 ** is still obtained without this case, only a little more slowly. 
*/ 005758 if( pCur->info.nKey+1==intKey ){ 005759 *pRes = 0; 005760 rc = sqlite3BtreeNext(pCur, 0); 005761 if( rc==SQLITE_OK ){ 005762 getCellInfo(pCur); 005763 if( pCur->info.nKey==intKey ){ 005764 return SQLITE_OK; 005765 } 005766 }else if( rc!=SQLITE_DONE ){ 005767 return rc; 005768 } 005769 } 005770 } 005771 } 005772 005773 #ifdef SQLITE_DEBUG 005774 pCur->pBtree->nSeek++; /* Performance measurement during testing */ 005775 #endif 005776 005777 rc = moveToRoot(pCur); 005778 if( rc ){ 005779 if( rc==SQLITE_EMPTY ){ 005780 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 005781 *pRes = -1; 005782 return SQLITE_OK; 005783 } 005784 return rc; 005785 } 005786 assert( pCur->pPage ); 005787 assert( pCur->pPage->isInit ); 005788 assert( pCur->eState==CURSOR_VALID ); 005789 assert( pCur->pPage->nCell > 0 ); 005790 assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey ); 005791 assert( pCur->curIntKey ); 005792 005793 for(;;){ 005794 int lwr, upr, idx, c; 005795 Pgno chldPg; 005796 MemPage *pPage = pCur->pPage; 005797 u8 *pCell; /* Pointer to current cell in pPage */ 005798 005799 /* pPage->nCell must be greater than zero. If this is the root-page 005800 ** the cursor would have been INVALID above and this for(;;) loop 005801 ** not run. If this is not the root-page, then the moveToChild() routine 005802 ** would have already detected db corruption. Similarly, pPage must 005803 ** be the right kind (index or table) of b-tree page. Otherwise 005804 ** a moveToChild() or moveToRoot() call would have detected corruption. */ 005805 assert( pPage->nCell>0 ); 005806 assert( pPage->intKey ); 005807 lwr = 0; 005808 upr = pPage->nCell-1; 005809 assert( biasRight==0 || biasRight==1 ); 005810 idx = upr>>(1-biasRight); /* idx = biasRight ? 
upr : (lwr+upr)/2; */ 005811 for(;;){ 005812 i64 nCellKey; 005813 pCell = findCellPastPtr(pPage, idx); 005814 if( pPage->intKeyLeaf ){ 005815 while( 0x80 <= *(pCell++) ){ 005816 if( pCell>=pPage->aDataEnd ){ 005817 return SQLITE_CORRUPT_PAGE(pPage); 005818 } 005819 } 005820 } 005821 getVarint(pCell, (u64*)&nCellKey); 005822 if( nCellKey<intKey ){ 005823 lwr = idx+1; 005824 if( lwr>upr ){ c = -1; break; } 005825 }else if( nCellKey>intKey ){ 005826 upr = idx-1; 005827 if( lwr>upr ){ c = +1; break; } 005828 }else{ 005829 assert( nCellKey==intKey ); 005830 pCur->ix = (u16)idx; 005831 if( !pPage->leaf ){ 005832 lwr = idx; 005833 goto moveto_table_next_layer; 005834 }else{ 005835 pCur->curFlags |= BTCF_ValidNKey; 005836 pCur->info.nKey = nCellKey; 005837 pCur->info.nSize = 0; 005838 *pRes = 0; 005839 return SQLITE_OK; 005840 } 005841 } 005842 assert( lwr+upr>=0 ); 005843 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */ 005844 } 005845 assert( lwr==upr+1 || !pPage->leaf ); 005846 assert( pPage->isInit ); 005847 if( pPage->leaf ){ 005848 assert( pCur->ix<pCur->pPage->nCell ); 005849 pCur->ix = (u16)idx; 005850 *pRes = c; 005851 rc = SQLITE_OK; 005852 goto moveto_table_finish; 005853 } 005854 moveto_table_next_layer: 005855 if( lwr>=pPage->nCell ){ 005856 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 005857 }else{ 005858 chldPg = get4byte(findCell(pPage, lwr)); 005859 } 005860 pCur->ix = (u16)lwr; 005861 rc = moveToChild(pCur, chldPg); 005862 if( rc ) break; 005863 } 005864 moveto_table_finish: 005865 pCur->info.nSize = 0; 005866 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); 005867 return rc; 005868 } 005869 005870 /* 005871 ** Compare the "idx"-th cell on the page the cursor pCur is currently 005872 ** pointing to to pIdxKey using xRecordCompare. Return negative or 005873 ** zero if the cell is less than or equal pIdxKey. Return positive 005874 ** if unknown. 
005875 ** 005876 ** Return value negative: Cell at pCur[idx] less than pIdxKey 005877 ** 005878 ** Return value is zero: Cell at pCur[idx] equals pIdxKey 005879 ** 005880 ** Return value positive: Nothing is known about the relationship 005881 ** of the cell at pCur[idx] and pIdxKey. 005882 ** 005883 ** This routine is part of an optimization. It is always safe to return 005884 ** a positive value as that will cause the optimization to be skipped. 005885 */ 005886 static int indexCellCompare( 005887 BtCursor *pCur, 005888 int idx, 005889 UnpackedRecord *pIdxKey, 005890 RecordCompare xRecordCompare 005891 ){ 005892 MemPage *pPage = pCur->pPage; 005893 int c; 005894 int nCell; /* Size of the pCell cell in bytes */ 005895 u8 *pCell = findCellPastPtr(pPage, idx); 005896 005897 nCell = pCell[0]; 005898 if( nCell<=pPage->max1bytePayload ){ 005899 /* This branch runs if the record-size field of the cell is a 005900 ** single byte varint and the record fits entirely on the main 005901 ** b-tree page. */ 005902 testcase( pCell+nCell+1==pPage->aDataEnd ); 005903 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); 005904 }else if( !(pCell[1] & 0x80) 005905 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal 005906 ){ 005907 /* The record-size field is a 2 byte varint and the record 005908 ** fits entirely on the main b-tree page. */ 005909 testcase( pCell+nCell+2==pPage->aDataEnd ); 005910 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); 005911 }else{ 005912 /* If the record extends into overflow pages, do not attempt 005913 ** the optimization. */ 005914 c = 99; 005915 } 005916 return c; 005917 } 005918 005919 /* 005920 ** Return true (non-zero) if pCur is current pointing to the last 005921 ** page of a table. 
005922 */ 005923 static int cursorOnLastPage(BtCursor *pCur){ 005924 int i; 005925 assert( pCur->eState==CURSOR_VALID ); 005926 for(i=0; i<pCur->iPage; i++){ 005927 MemPage *pPage = pCur->apPage[i]; 005928 if( pCur->aiIdx[i]<pPage->nCell ) return 0; 005929 } 005930 return 1; 005931 } 005932 005933 /* Move the cursor so that it points to an entry in an index table 005934 ** near the key pIdxKey. Return a success code. 005935 ** 005936 ** If an exact match is not found, then the cursor is always 005937 ** left pointing at a leaf page which would hold the entry if it 005938 ** were present. The cursor might point to an entry that comes 005939 ** before or after the key. 005940 ** 005941 ** An integer is written into *pRes which is the result of 005942 ** comparing the key with the entry to which the cursor is 005943 ** pointing. The meaning of the integer written into 005944 ** *pRes is as follows: 005945 ** 005946 ** *pRes<0 The cursor is left pointing at an entry that 005947 ** is smaller than pIdxKey or if the table is empty 005948 ** and the cursor is therefore left point to nothing. 005949 ** 005950 ** *pRes==0 The cursor is left pointing at an entry that 005951 ** exactly matches pIdxKey. 005952 ** 005953 ** *pRes>0 The cursor is left pointing at an entry that 005954 ** is larger than pIdxKey. 005955 ** 005956 ** The pIdxKey->eqSeen field is set to 1 if there 005957 ** exists an entry in the table that exactly matches pIdxKey. 
005958 */ 005959 int sqlite3BtreeIndexMoveto( 005960 BtCursor *pCur, /* The cursor to be moved */ 005961 UnpackedRecord *pIdxKey, /* Unpacked index key */ 005962 int *pRes /* Write search results here */ 005963 ){ 005964 int rc; 005965 RecordCompare xRecordCompare; 005966 005967 assert( cursorOwnsBtShared(pCur) ); 005968 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005969 assert( pRes ); 005970 assert( pCur->pKeyInfo!=0 ); 005971 005972 #ifdef SQLITE_DEBUG 005973 pCur->pBtree->nSeek++; /* Performance measurement during testing */ 005974 #endif 005975 005976 xRecordCompare = sqlite3VdbeFindCompare(pIdxKey); 005977 pIdxKey->errCode = 0; 005978 assert( pIdxKey->default_rc==1 005979 || pIdxKey->default_rc==0 005980 || pIdxKey->default_rc==-1 005981 ); 005982 005983 005984 /* Check to see if we can skip a lot of work. Two cases: 005985 ** 005986 ** (1) If the cursor is already pointing to the very last cell 005987 ** in the table and the pIdxKey search key is greater than or 005988 ** equal to that last cell, then no movement is required. 005989 ** 005990 ** (2) If the cursor is on the last page of the table and the first 005991 ** cell on that last page is less than or equal to the pIdxKey 005992 ** search key, then we can start the search on the current page 005993 ** without needing to go back to root. 
005994 */ 005995 if( pCur->eState==CURSOR_VALID 005996 && pCur->pPage->leaf 005997 && cursorOnLastPage(pCur) 005998 ){ 005999 int c; 006000 if( pCur->ix==pCur->pPage->nCell-1 006001 && (c = indexCellCompare(pCur, pCur->ix, pIdxKey, xRecordCompare))<=0 006002 && pIdxKey->errCode==SQLITE_OK 006003 ){ 006004 *pRes = c; 006005 return SQLITE_OK; /* Cursor already pointing at the correct spot */ 006006 } 006007 if( pCur->iPage>0 006008 && indexCellCompare(pCur, 0, pIdxKey, xRecordCompare)<=0 006009 && pIdxKey->errCode==SQLITE_OK 006010 ){ 006011 pCur->curFlags &= ~(BTCF_ValidOvfl|BTCF_AtLast); 006012 if( !pCur->pPage->isInit ){ 006013 return SQLITE_CORRUPT_BKPT; 006014 } 006015 goto bypass_moveto_root; /* Start search on the current page */ 006016 } 006017 pIdxKey->errCode = SQLITE_OK; 006018 } 006019 006020 rc = moveToRoot(pCur); 006021 if( rc ){ 006022 if( rc==SQLITE_EMPTY ){ 006023 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 006024 *pRes = -1; 006025 return SQLITE_OK; 006026 } 006027 return rc; 006028 } 006029 006030 bypass_moveto_root: 006031 assert( pCur->pPage ); 006032 assert( pCur->pPage->isInit ); 006033 assert( pCur->eState==CURSOR_VALID ); 006034 assert( pCur->pPage->nCell > 0 ); 006035 assert( pCur->curIntKey==0 ); 006036 assert( pIdxKey!=0 ); 006037 for(;;){ 006038 int lwr, upr, idx, c; 006039 Pgno chldPg; 006040 MemPage *pPage = pCur->pPage; 006041 u8 *pCell; /* Pointer to current cell in pPage */ 006042 006043 /* pPage->nCell must be greater than zero. If this is the root-page 006044 ** the cursor would have been INVALID above and this for(;;) loop 006045 ** not run. If this is not the root-page, then the moveToChild() routine 006046 ** would have already detected db corruption. Similarly, pPage must 006047 ** be the right kind (index or table) of b-tree page. Otherwise 006048 ** a moveToChild() or moveToRoot() call would have detected corruption. 
*/ 006049 assert( pPage->nCell>0 ); 006050 assert( pPage->intKey==0 ); 006051 lwr = 0; 006052 upr = pPage->nCell-1; 006053 idx = upr>>1; /* idx = (lwr+upr)/2; */ 006054 for(;;){ 006055 int nCell; /* Size of the pCell cell in bytes */ 006056 pCell = findCellPastPtr(pPage, idx); 006057 006058 /* The maximum supported page-size is 65536 bytes. This means that 006059 ** the maximum number of record bytes stored on an index B-Tree 006060 ** page is less than 16384 bytes and may be stored as a 2-byte 006061 ** varint. This information is used to attempt to avoid parsing 006062 ** the entire cell by checking for the cases where the record is 006063 ** stored entirely within the b-tree page by inspecting the first 006064 ** 2 bytes of the cell. 006065 */ 006066 nCell = pCell[0]; 006067 if( nCell<=pPage->max1bytePayload ){ 006068 /* This branch runs if the record-size field of the cell is a 006069 ** single byte varint and the record fits entirely on the main 006070 ** b-tree page. */ 006071 testcase( pCell+nCell+1==pPage->aDataEnd ); 006072 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); 006073 }else if( !(pCell[1] & 0x80) 006074 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal 006075 ){ 006076 /* The record-size field is a 2 byte varint and the record 006077 ** fits entirely on the main b-tree page. */ 006078 testcase( pCell+nCell+2==pPage->aDataEnd ); 006079 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); 006080 }else{ 006081 /* The record flows over onto one or more overflow pages. In 006082 ** this case the whole cell needs to be parsed, a buffer allocated 006083 ** and accessPayload() used to retrieve the record into the 006084 ** buffer before VdbeRecordCompare() can be called. 006085 ** 006086 ** If the record is corrupt, the xRecordCompare routine may read 006087 ** up to two varints past the end of the buffer. An extra 18 006088 ** bytes of padding is allocated at the end of the buffer in 006089 ** case this happens. 
*/ 006090 void *pCellKey; 006091 u8 * const pCellBody = pCell - pPage->childPtrSize; 006092 const int nOverrun = 18; /* Size of the overrun padding */ 006093 pPage->xParseCell(pPage, pCellBody, &pCur->info); 006094 nCell = (int)pCur->info.nKey; 006095 testcase( nCell<0 ); /* True if key size is 2^32 or more */ 006096 testcase( nCell==0 ); /* Invalid key size: 0x80 0x80 0x00 */ 006097 testcase( nCell==1 ); /* Invalid key size: 0x80 0x80 0x01 */ 006098 testcase( nCell==2 ); /* Minimum legal index key size */ 006099 if( nCell<2 || nCell/pCur->pBt->usableSize>pCur->pBt->nPage ){ 006100 rc = SQLITE_CORRUPT_PAGE(pPage); 006101 goto moveto_index_finish; 006102 } 006103 pCellKey = sqlite3Malloc( nCell+nOverrun ); 006104 if( pCellKey==0 ){ 006105 rc = SQLITE_NOMEM_BKPT; 006106 goto moveto_index_finish; 006107 } 006108 pCur->ix = (u16)idx; 006109 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0); 006110 memset(((u8*)pCellKey)+nCell,0,nOverrun); /* Fix uninit warnings */ 006111 pCur->curFlags &= ~BTCF_ValidOvfl; 006112 if( rc ){ 006113 sqlite3_free(pCellKey); 006114 goto moveto_index_finish; 006115 } 006116 c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey); 006117 sqlite3_free(pCellKey); 006118 } 006119 assert( 006120 (pIdxKey->errCode!=SQLITE_CORRUPT || c==0) 006121 && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed) 006122 ); 006123 if( c<0 ){ 006124 lwr = idx+1; 006125 }else if( c>0 ){ 006126 upr = idx-1; 006127 }else{ 006128 assert( c==0 ); 006129 *pRes = 0; 006130 rc = SQLITE_OK; 006131 pCur->ix = (u16)idx; 006132 if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT; 006133 goto moveto_index_finish; 006134 } 006135 if( lwr>upr ) break; 006136 assert( lwr+upr>=0 ); 006137 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2 */ 006138 } 006139 assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) ); 006140 assert( pPage->isInit ); 006141 if( pPage->leaf ){ 006142 assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB ); 006143 pCur->ix = (u16)idx; 006144 
*pRes = c; 006145 rc = SQLITE_OK; 006146 goto moveto_index_finish; 006147 } 006148 if( lwr>=pPage->nCell ){ 006149 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 006150 }else{ 006151 chldPg = get4byte(findCell(pPage, lwr)); 006152 } 006153 006154 /* This block is similar to an in-lined version of: 006155 ** 006156 ** pCur->ix = (u16)lwr; 006157 ** rc = moveToChild(pCur, chldPg); 006158 ** if( rc ) break; 006159 */ 006160 pCur->info.nSize = 0; 006161 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 006162 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ 006163 return SQLITE_CORRUPT_BKPT; 006164 } 006165 pCur->aiIdx[pCur->iPage] = (u16)lwr; 006166 pCur->apPage[pCur->iPage] = pCur->pPage; 006167 pCur->ix = 0; 006168 pCur->iPage++; 006169 rc = getAndInitPage(pCur->pBt, chldPg, &pCur->pPage, pCur->curPagerFlags); 006170 if( rc==SQLITE_OK 006171 && (pCur->pPage->nCell<1 || pCur->pPage->intKey!=pCur->curIntKey) 006172 ){ 006173 releasePage(pCur->pPage); 006174 rc = SQLITE_CORRUPT_PGNO(chldPg); 006175 } 006176 if( rc ){ 006177 pCur->pPage = pCur->apPage[--pCur->iPage]; 006178 break; 006179 } 006180 /* 006181 ***** End of in-lined moveToChild() call */ 006182 } 006183 moveto_index_finish: 006184 pCur->info.nSize = 0; 006185 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); 006186 return rc; 006187 } 006188 006189 006190 /* 006191 ** Return TRUE if the cursor is not pointing at an entry of the table. 006192 ** 006193 ** TRUE will be returned after a call to sqlite3BtreeNext() moves 006194 ** past the last entry in the table or sqlite3BtreePrev() moves past 006195 ** the first entry. TRUE is also returned if the table is empty. 006196 */ 006197 int sqlite3BtreeEof(BtCursor *pCur){ 006198 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries 006199 ** have been deleted? This API will need to change to return an error code 006200 ** as well as the boolean result value. 
006201 */ 006202 return (CURSOR_VALID!=pCur->eState); 006203 } 006204 006205 /* 006206 ** Return an estimate for the number of rows in the table that pCur is 006207 ** pointing to. Return a negative number if no estimate is currently 006208 ** available. 006209 */ 006210 i64 sqlite3BtreeRowCountEst(BtCursor *pCur){ 006211 i64 n; 006212 u8 i; 006213 006214 assert( cursorOwnsBtShared(pCur) ); 006215 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 006216 006217 /* Currently this interface is only called by the OP_IfSizeBetween 006218 ** opcode and the OP_Count opcode with P3=1. In either case, 006219 ** the cursor will always be valid unless the btree is empty. */ 006220 if( pCur->eState!=CURSOR_VALID ) return 0; 006221 if( NEVER(pCur->pPage->leaf==0) ) return -1; 006222 006223 n = pCur->pPage->nCell; 006224 for(i=0; i<pCur->iPage; i++){ 006225 n *= pCur->apPage[i]->nCell; 006226 } 006227 return n; 006228 } 006229 006230 /* 006231 ** Advance the cursor to the next entry in the database. 006232 ** Return value: 006233 ** 006234 ** SQLITE_OK success 006235 ** SQLITE_DONE cursor is already pointing at the last element 006236 ** otherwise some kind of error occurred 006237 ** 006238 ** The main entry point is sqlite3BtreeNext(). That routine is optimized 006239 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx 006240 ** to the next cell on the current page. The (slower) btreeNext() helper 006241 ** routine is called when it is necessary to move to a different page or 006242 ** to restore the cursor. 006243 ** 006244 ** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the 006245 ** cursor corresponds to an SQL index and this routine could have been 006246 ** skipped if the SQL index had been a unique index. The F argument 006247 ** is a hint to the implement. SQLite btree implementation does not use 006248 ** this hint, but COMDB2 does. 
006249 */ 006250 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){ 006251 int rc; 006252 int idx; 006253 MemPage *pPage; 006254 006255 assert( cursorOwnsBtShared(pCur) ); 006256 if( pCur->eState!=CURSOR_VALID ){ 006257 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); 006258 rc = restoreCursorPosition(pCur); 006259 if( rc!=SQLITE_OK ){ 006260 return rc; 006261 } 006262 if( CURSOR_INVALID==pCur->eState ){ 006263 return SQLITE_DONE; 006264 } 006265 if( pCur->eState==CURSOR_SKIPNEXT ){ 006266 pCur->eState = CURSOR_VALID; 006267 if( pCur->skipNext>0 ) return SQLITE_OK; 006268 } 006269 } 006270 006271 pPage = pCur->pPage; 006272 idx = ++pCur->ix; 006273 if( sqlite3FaultSim(412) ) pPage->isInit = 0; 006274 if( !pPage->isInit ){ 006275 return SQLITE_CORRUPT_BKPT; 006276 } 006277 006278 if( idx>=pPage->nCell ){ 006279 if( !pPage->leaf ){ 006280 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 006281 if( rc ) return rc; 006282 return moveToLeftmost(pCur); 006283 } 006284 do{ 006285 if( pCur->iPage==0 ){ 006286 pCur->eState = CURSOR_INVALID; 006287 return SQLITE_DONE; 006288 } 006289 moveToParent(pCur); 006290 pPage = pCur->pPage; 006291 }while( pCur->ix>=pPage->nCell ); 006292 if( pPage->intKey ){ 006293 return sqlite3BtreeNext(pCur, 0); 006294 }else{ 006295 return SQLITE_OK; 006296 } 006297 } 006298 if( pPage->leaf ){ 006299 return SQLITE_OK; 006300 }else{ 006301 return moveToLeftmost(pCur); 006302 } 006303 } 006304 int sqlite3BtreeNext(BtCursor *pCur, int flags){ 006305 MemPage *pPage; 006306 UNUSED_PARAMETER( flags ); /* Used in COMDB2 but not native SQLite */ 006307 assert( cursorOwnsBtShared(pCur) ); 006308 assert( flags==0 || flags==1 ); 006309 pCur->info.nSize = 0; 006310 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 006311 if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur); 006312 pPage = pCur->pPage; 006313 if( (++pCur->ix)>=pPage->nCell ){ 006314 pCur->ix--; 006315 return btreeNext(pCur); 006316 } 006317 if( pPage->leaf ){ 006318 
return SQLITE_OK; 006319 }else{ 006320 return moveToLeftmost(pCur); 006321 } 006322 } 006323 006324 /* 006325 ** Step the cursor to the back to the previous entry in the database. 006326 ** Return values: 006327 ** 006328 ** SQLITE_OK success 006329 ** SQLITE_DONE the cursor is already on the first element of the table 006330 ** otherwise some kind of error occurred 006331 ** 006332 ** The main entry point is sqlite3BtreePrevious(). That routine is optimized 006333 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx 006334 ** to the previous cell on the current page. The (slower) btreePrevious() 006335 ** helper routine is called when it is necessary to move to a different page 006336 ** or to restore the cursor. 006337 ** 006338 ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then 006339 ** the cursor corresponds to an SQL index and this routine could have been 006340 ** skipped if the SQL index had been a unique index. The F argument is a 006341 ** hint to the implement. The native SQLite btree implementation does not 006342 ** use this hint, but COMDB2 does. 
006343 */ 006344 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){ 006345 int rc; 006346 MemPage *pPage; 006347 006348 assert( cursorOwnsBtShared(pCur) ); 006349 assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 ); 006350 assert( pCur->info.nSize==0 ); 006351 if( pCur->eState!=CURSOR_VALID ){ 006352 rc = restoreCursorPosition(pCur); 006353 if( rc!=SQLITE_OK ){ 006354 return rc; 006355 } 006356 if( CURSOR_INVALID==pCur->eState ){ 006357 return SQLITE_DONE; 006358 } 006359 if( CURSOR_SKIPNEXT==pCur->eState ){ 006360 pCur->eState = CURSOR_VALID; 006361 if( pCur->skipNext<0 ) return SQLITE_OK; 006362 } 006363 } 006364 006365 pPage = pCur->pPage; 006366 if( sqlite3FaultSim(412) ) pPage->isInit = 0; 006367 if( !pPage->isInit ){ 006368 return SQLITE_CORRUPT_BKPT; 006369 } 006370 if( !pPage->leaf ){ 006371 int idx = pCur->ix; 006372 rc = moveToChild(pCur, get4byte(findCell(pPage, idx))); 006373 if( rc ) return rc; 006374 rc = moveToRightmost(pCur); 006375 }else{ 006376 while( pCur->ix==0 ){ 006377 if( pCur->iPage==0 ){ 006378 pCur->eState = CURSOR_INVALID; 006379 return SQLITE_DONE; 006380 } 006381 moveToParent(pCur); 006382 } 006383 assert( pCur->info.nSize==0 ); 006384 assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 ); 006385 006386 pCur->ix--; 006387 pPage = pCur->pPage; 006388 if( pPage->intKey && !pPage->leaf ){ 006389 rc = sqlite3BtreePrevious(pCur, 0); 006390 }else{ 006391 rc = SQLITE_OK; 006392 } 006393 } 006394 return rc; 006395 } 006396 int sqlite3BtreePrevious(BtCursor *pCur, int flags){ 006397 assert( cursorOwnsBtShared(pCur) ); 006398 assert( flags==0 || flags==1 ); 006399 UNUSED_PARAMETER( flags ); /* Used in COMDB2 but not native SQLite */ 006400 pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey); 006401 pCur->info.nSize = 0; 006402 if( pCur->eState!=CURSOR_VALID 006403 || pCur->ix==0 006404 || pCur->pPage->leaf==0 006405 ){ 006406 return btreePrevious(pCur); 006407 } 006408 pCur->ix--; 006409 return SQLITE_OK; 
006410 } 006411 006412 /* 006413 ** Allocate a new page from the database file. 006414 ** 006415 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite() 006416 ** has already been called on the new page.) The new page has also 006417 ** been referenced and the calling routine is responsible for calling 006418 ** sqlite3PagerUnref() on the new page when it is done. 006419 ** 006420 ** SQLITE_OK is returned on success. Any other return value indicates 006421 ** an error. *ppPage is set to NULL in the event of an error. 006422 ** 006423 ** If the "nearby" parameter is not 0, then an effort is made to 006424 ** locate a page close to the page number "nearby". This can be used in an 006425 ** attempt to keep related pages close to each other in the database file, 006426 ** which in turn can make database access faster. 006427 ** 006428 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists 006429 ** anywhere on the free-list, then it is guaranteed to be returned. If 006430 ** eMode is BTALLOC_LT then the page returned will be less than or equal 006431 ** to nearby if any such page exists. If eMode is BTALLOC_ANY then there 006432 ** are no restrictions on which page is returned. 
006433 */ 006434 static int allocateBtreePage( 006435 BtShared *pBt, /* The btree */ 006436 MemPage **ppPage, /* Store pointer to the allocated page here */ 006437 Pgno *pPgno, /* Store the page number here */ 006438 Pgno nearby, /* Search for a page near this one */ 006439 u8 eMode /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */ 006440 ){ 006441 MemPage *pPage1; 006442 int rc; 006443 u32 n; /* Number of pages on the freelist */ 006444 u32 k; /* Number of leaves on the trunk of the freelist */ 006445 MemPage *pTrunk = 0; 006446 MemPage *pPrevTrunk = 0; 006447 Pgno mxPage; /* Total size of the database file */ 006448 006449 assert( sqlite3_mutex_held(pBt->mutex) ); 006450 assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) ); 006451 pPage1 = pBt->pPage1; 006452 mxPage = btreePagecount(pBt); 006453 /* EVIDENCE-OF: R-21003-45125 The 4-byte big-endian integer at offset 36 006454 ** stores the total number of pages on the freelist. */ 006455 n = get4byte(&pPage1->aData[36]); 006456 testcase( n==mxPage-1 ); 006457 if( n>=mxPage ){ 006458 return SQLITE_CORRUPT_BKPT; 006459 } 006460 if( n>0 ){ 006461 /* There are pages on the freelist. Reuse one of those pages. */ 006462 Pgno iTrunk; 006463 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ 006464 u32 nSearch = 0; /* Count of the number of search attempts */ 006465 006466 /* If eMode==BTALLOC_EXACT and a query of the pointer-map 006467 ** shows that the page 'nearby' is somewhere on the free-list, then 006468 ** the entire-list will be searched for that page. 
006469 */ 006470 #ifndef SQLITE_OMIT_AUTOVACUUM 006471 if( eMode==BTALLOC_EXACT ){ 006472 if( nearby<=mxPage ){ 006473 u8 eType; 006474 assert( nearby>0 ); 006475 assert( pBt->autoVacuum ); 006476 rc = ptrmapGet(pBt, nearby, &eType, 0); 006477 if( rc ) return rc; 006478 if( eType==PTRMAP_FREEPAGE ){ 006479 searchList = 1; 006480 } 006481 } 006482 }else if( eMode==BTALLOC_LE ){ 006483 searchList = 1; 006484 } 006485 #endif 006486 006487 /* Decrement the free-list count by 1. Set iTrunk to the index of the 006488 ** first free-list trunk page. iPrevTrunk is initially 1. 006489 */ 006490 rc = sqlite3PagerWrite(pPage1->pDbPage); 006491 if( rc ) return rc; 006492 put4byte(&pPage1->aData[36], n-1); 006493 006494 /* The code within this loop is run only once if the 'searchList' variable 006495 ** is not true. Otherwise, it runs once for each trunk-page on the 006496 ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT) 006497 ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT) 006498 */ 006499 do { 006500 pPrevTrunk = pTrunk; 006501 if( pPrevTrunk ){ 006502 /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page 006503 ** is the page number of the next freelist trunk page in the list or 006504 ** zero if this is the last freelist trunk page. */ 006505 iTrunk = get4byte(&pPrevTrunk->aData[0]); 006506 }else{ 006507 /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32 006508 ** stores the page number of the first page of the freelist, or zero if 006509 ** the freelist is empty. */ 006510 iTrunk = get4byte(&pPage1->aData[32]); 006511 } 006512 testcase( iTrunk==mxPage ); 006513 if( iTrunk>mxPage || nSearch++ > n ){ 006514 rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? 
pPrevTrunk->pgno : 1); 006515 }else{ 006516 rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0); 006517 } 006518 if( rc ){ 006519 pTrunk = 0; 006520 goto end_allocate_page; 006521 } 006522 assert( pTrunk!=0 ); 006523 assert( pTrunk->aData!=0 ); 006524 /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page 006525 ** is the number of leaf page pointers to follow. */ 006526 k = get4byte(&pTrunk->aData[4]); 006527 if( k==0 && !searchList ){ 006528 /* The trunk has no leaves and the list is not being searched. 006529 ** So extract the trunk page itself and use it as the newly 006530 ** allocated page */ 006531 assert( pPrevTrunk==0 ); 006532 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006533 if( rc ){ 006534 goto end_allocate_page; 006535 } 006536 *pPgno = iTrunk; 006537 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 006538 *ppPage = pTrunk; 006539 pTrunk = 0; 006540 TRACE(("ALLOCATE: %u trunk - %u free pages left\n", *pPgno, n-1)); 006541 }else if( k>(u32)(pBt->usableSize/4 - 2) ){ 006542 /* Value of k is out of range. Database corruption */ 006543 rc = SQLITE_CORRUPT_PGNO(iTrunk); 006544 goto end_allocate_page; 006545 #ifndef SQLITE_OMIT_AUTOVACUUM 006546 }else if( searchList 006547 && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE)) 006548 ){ 006549 /* The list is being searched and this trunk page is the page 006550 ** to allocate, regardless of whether it has leaves. 
006551 */ 006552 *pPgno = iTrunk; 006553 *ppPage = pTrunk; 006554 searchList = 0; 006555 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006556 if( rc ){ 006557 goto end_allocate_page; 006558 } 006559 if( k==0 ){ 006560 if( !pPrevTrunk ){ 006561 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 006562 }else{ 006563 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 006564 if( rc!=SQLITE_OK ){ 006565 goto end_allocate_page; 006566 } 006567 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); 006568 } 006569 }else{ 006570 /* The trunk page is required by the caller but it contains 006571 ** pointers to free-list leaves. The first leaf becomes a trunk 006572 ** page in this case. 006573 */ 006574 MemPage *pNewTrunk; 006575 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); 006576 if( iNewTrunk>mxPage ){ 006577 rc = SQLITE_CORRUPT_PGNO(iTrunk); 006578 goto end_allocate_page; 006579 } 006580 testcase( iNewTrunk==mxPage ); 006581 rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0); 006582 if( rc!=SQLITE_OK ){ 006583 goto end_allocate_page; 006584 } 006585 rc = sqlite3PagerWrite(pNewTrunk->pDbPage); 006586 if( rc!=SQLITE_OK ){ 006587 releasePage(pNewTrunk); 006588 goto end_allocate_page; 006589 } 006590 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); 006591 put4byte(&pNewTrunk->aData[4], k-1); 006592 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); 006593 releasePage(pNewTrunk); 006594 if( !pPrevTrunk ){ 006595 assert( sqlite3PagerIswriteable(pPage1->pDbPage) ); 006596 put4byte(&pPage1->aData[32], iNewTrunk); 006597 }else{ 006598 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 006599 if( rc ){ 006600 goto end_allocate_page; 006601 } 006602 put4byte(&pPrevTrunk->aData[0], iNewTrunk); 006603 } 006604 } 006605 pTrunk = 0; 006606 TRACE(("ALLOCATE: %u trunk - %u free pages left\n", *pPgno, n-1)); 006607 #endif 006608 }else if( k>0 ){ 006609 /* Extract a leaf from the trunk */ 006610 u32 closest; 006611 Pgno iPage; 006612 unsigned char *aData = pTrunk->aData; 006613 if( nearby>0 ){ 
006614 u32 i; 006615 closest = 0; 006616 if( eMode==BTALLOC_LE ){ 006617 for(i=0; i<k; i++){ 006618 iPage = get4byte(&aData[8+i*4]); 006619 if( iPage<=nearby ){ 006620 closest = i; 006621 break; 006622 } 006623 } 006624 }else{ 006625 int dist; 006626 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby); 006627 for(i=1; i<k; i++){ 006628 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby); 006629 if( d2<dist ){ 006630 closest = i; 006631 dist = d2; 006632 } 006633 } 006634 } 006635 }else{ 006636 closest = 0; 006637 } 006638 006639 iPage = get4byte(&aData[8+closest*4]); 006640 testcase( iPage==mxPage ); 006641 if( iPage>mxPage || iPage<2 ){ 006642 rc = SQLITE_CORRUPT_PGNO(iTrunk); 006643 goto end_allocate_page; 006644 } 006645 testcase( iPage==mxPage ); 006646 if( !searchList 006647 || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE)) 006648 ){ 006649 int noContent; 006650 *pPgno = iPage; 006651 TRACE(("ALLOCATE: %u was leaf %u of %u on trunk %u" 006652 ": %u more free pages\n", 006653 *pPgno, closest+1, k, pTrunk->pgno, n-1)); 006654 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006655 if( rc ) goto end_allocate_page; 006656 if( closest<k-1 ){ 006657 memcpy(&aData[8+closest*4], &aData[4+k*4], 4); 006658 } 006659 put4byte(&aData[4], k-1); 006660 noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0; 006661 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent); 006662 if( rc==SQLITE_OK ){ 006663 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 006664 if( rc!=SQLITE_OK ){ 006665 releasePage(*ppPage); 006666 *ppPage = 0; 006667 } 006668 } 006669 searchList = 0; 006670 } 006671 } 006672 releasePage(pPrevTrunk); 006673 pPrevTrunk = 0; 006674 }while( searchList ); 006675 }else{ 006676 /* There are no pages on the freelist, so append a new page to the 006677 ** database image. 006678 ** 006679 ** Normally, new pages allocated by this block can be requested from the 006680 ** pager layer with the 'no-content' flag set. 
This prevents the pager 006681 ** from trying to read the pages content from disk. However, if the 006682 ** current transaction has already run one or more incremental-vacuum 006683 ** steps, then the page we are about to allocate may contain content 006684 ** that is required in the event of a rollback. In this case, do 006685 ** not set the no-content flag. This causes the pager to load and journal 006686 ** the current page content before overwriting it. 006687 ** 006688 ** Note that the pager will not actually attempt to load or journal 006689 ** content for any page that really does lie past the end of the database 006690 ** file on disk. So the effects of disabling the no-content optimization 006691 ** here are confined to those pages that lie between the end of the 006692 ** database image and the end of the database file. 006693 */ 006694 int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0; 006695 006696 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 006697 if( rc ) return rc; 006698 pBt->nPage++; 006699 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++; 006700 006701 #ifndef SQLITE_OMIT_AUTOVACUUM 006702 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){ 006703 /* If *pPgno refers to a pointer-map page, allocate two new pages 006704 ** at the end of the file instead of one. The first allocated page 006705 ** becomes a new pointer-map page, the second is used by the caller. 
006706 */ 006707 MemPage *pPg = 0; 006708 TRACE(("ALLOCATE: %u from end of file (pointer-map page)\n", pBt->nPage)); 006709 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) ); 006710 rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent); 006711 if( rc==SQLITE_OK ){ 006712 rc = sqlite3PagerWrite(pPg->pDbPage); 006713 releasePage(pPg); 006714 } 006715 if( rc ) return rc; 006716 pBt->nPage++; 006717 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; } 006718 } 006719 #endif 006720 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage); 006721 *pPgno = pBt->nPage; 006722 006723 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 006724 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent); 006725 if( rc ) return rc; 006726 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 006727 if( rc!=SQLITE_OK ){ 006728 releasePage(*ppPage); 006729 *ppPage = 0; 006730 } 006731 TRACE(("ALLOCATE: %u from end of file\n", *pPgno)); 006732 } 006733 006734 assert( CORRUPT_DB || *pPgno!=PENDING_BYTE_PAGE(pBt) ); 006735 006736 end_allocate_page: 006737 releasePage(pTrunk); 006738 releasePage(pPrevTrunk); 006739 assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 ); 006740 assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 ); 006741 return rc; 006742 } 006743 006744 /* 006745 ** This function is used to add page iPage to the database file free-list. 006746 ** It is assumed that the page is not already a part of the free-list. 006747 ** 006748 ** The value passed as the second argument to this function is optional. 006749 ** If the caller happens to have a pointer to the MemPage object 006750 ** corresponding to page iPage handy, it may pass it as the second value. 006751 ** Otherwise, it may pass NULL. 006752 ** 006753 ** If a pointer to a MemPage object is passed as the second argument, 006754 ** its reference count is not altered by this function. 
006755 */ 006756 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ 006757 MemPage *pTrunk = 0; /* Free-list trunk page */ 006758 Pgno iTrunk = 0; /* Page number of free-list trunk page */ 006759 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */ 006760 MemPage *pPage; /* Page being freed. May be NULL. */ 006761 int rc; /* Return Code */ 006762 u32 nFree; /* Initial number of pages on free-list */ 006763 006764 assert( sqlite3_mutex_held(pBt->mutex) ); 006765 assert( CORRUPT_DB || iPage>1 ); 006766 assert( !pMemPage || pMemPage->pgno==iPage ); 006767 006768 if( iPage<2 || iPage>pBt->nPage ){ 006769 return SQLITE_CORRUPT_BKPT; 006770 } 006771 if( pMemPage ){ 006772 pPage = pMemPage; 006773 sqlite3PagerRef(pPage->pDbPage); 006774 }else{ 006775 pPage = btreePageLookup(pBt, iPage); 006776 } 006777 006778 /* Increment the free page count on pPage1 */ 006779 rc = sqlite3PagerWrite(pPage1->pDbPage); 006780 if( rc ) goto freepage_out; 006781 nFree = get4byte(&pPage1->aData[36]); 006782 put4byte(&pPage1->aData[36], nFree+1); 006783 006784 if( pBt->btsFlags & BTS_SECURE_DELETE ){ 006785 /* If the secure_delete option is enabled, then 006786 ** always fully overwrite deleted information with zeros. 006787 */ 006788 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) ) 006789 || ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0) 006790 ){ 006791 goto freepage_out; 006792 } 006793 memset(pPage->aData, 0, pPage->pBt->pageSize); 006794 } 006795 006796 /* If the database supports auto-vacuum, write an entry in the pointer-map 006797 ** to indicate that the page is free. 006798 */ 006799 if( ISAUTOVACUUM(pBt) ){ 006800 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc); 006801 if( rc ) goto freepage_out; 006802 } 006803 006804 /* Now manipulate the actual database free-list structure. There are two 006805 ** possibilities. 
If the free-list is currently empty, or if the first 006806 ** trunk page in the free-list is full, then this page will become a 006807 ** new free-list trunk page. Otherwise, it will become a leaf of the 006808 ** first trunk page in the current free-list. This block tests if it 006809 ** is possible to add the page as a new free-list leaf. 006810 */ 006811 if( nFree!=0 ){ 006812 u32 nLeaf; /* Initial number of leaf cells on trunk page */ 006813 006814 iTrunk = get4byte(&pPage1->aData[32]); 006815 if( iTrunk>btreePagecount(pBt) ){ 006816 rc = SQLITE_CORRUPT_BKPT; 006817 goto freepage_out; 006818 } 006819 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); 006820 if( rc!=SQLITE_OK ){ 006821 goto freepage_out; 006822 } 006823 006824 nLeaf = get4byte(&pTrunk->aData[4]); 006825 assert( pBt->usableSize>32 ); 006826 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){ 006827 rc = SQLITE_CORRUPT_BKPT; 006828 goto freepage_out; 006829 } 006830 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){ 006831 /* In this case there is room on the trunk page to insert the page 006832 ** being freed as a new leaf. 006833 ** 006834 ** Note that the trunk page is not really full until it contains 006835 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have 006836 ** coded. But due to a coding error in versions of SQLite prior to 006837 ** 3.6.0, databases with freelist trunk pages holding more than 006838 ** usableSize/4 - 8 entries will be reported as corrupt. In order 006839 ** to maintain backwards compatibility with older versions of SQLite, 006840 ** we will continue to restrict the number of entries to usableSize/4 - 8 006841 ** for now. At some point in the future (once everyone has upgraded 006842 ** to 3.6.0 or later) we should consider fixing the conditional above 006843 ** to read "usableSize/4-2" instead of "usableSize/4-8". 
006844 ** 006845 ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still 006846 ** avoid using the last six entries in the freelist trunk page array in 006847 ** order that database files created by newer versions of SQLite can be 006848 ** read by older versions of SQLite. 006849 */ 006850 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006851 if( rc==SQLITE_OK ){ 006852 put4byte(&pTrunk->aData[4], nLeaf+1); 006853 put4byte(&pTrunk->aData[8+nLeaf*4], iPage); 006854 if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){ 006855 sqlite3PagerDontWrite(pPage->pDbPage); 006856 } 006857 rc = btreeSetHasContent(pBt, iPage); 006858 } 006859 TRACE(("FREE-PAGE: %u leaf on trunk page %u\n",pPage->pgno,pTrunk->pgno)); 006860 goto freepage_out; 006861 } 006862 } 006863 006864 /* If control flows to this point, then it was not possible to add the 006865 ** the page being freed as a leaf page of the first trunk in the free-list. 006866 ** Possibly because the free-list is empty, or possibly because the 006867 ** first trunk in the free-list is full. Either way, the page being freed 006868 ** will become the new first trunk page in the free-list. 
006869 */ 006870 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){ 006871 goto freepage_out; 006872 } 006873 rc = sqlite3PagerWrite(pPage->pDbPage); 006874 if( rc!=SQLITE_OK ){ 006875 goto freepage_out; 006876 } 006877 put4byte(pPage->aData, iTrunk); 006878 put4byte(&pPage->aData[4], 0); 006879 put4byte(&pPage1->aData[32], iPage); 006880 TRACE(("FREE-PAGE: %u new trunk page replacing %u\n", pPage->pgno, iTrunk)); 006881 006882 freepage_out: 006883 if( pPage ){ 006884 pPage->isInit = 0; 006885 } 006886 releasePage(pPage); 006887 releasePage(pTrunk); 006888 return rc; 006889 } 006890 static void freePage(MemPage *pPage, int *pRC){ 006891 if( (*pRC)==SQLITE_OK ){ 006892 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno); 006893 } 006894 } 006895 006896 /* 006897 ** Free the overflow pages associated with the given Cell. 006898 */ 006899 static SQLITE_NOINLINE int clearCellOverflow( 006900 MemPage *pPage, /* The page that contains the Cell */ 006901 unsigned char *pCell, /* First byte of the Cell */ 006902 CellInfo *pInfo /* Size information about the cell */ 006903 ){ 006904 BtShared *pBt; 006905 Pgno ovflPgno; 006906 int rc; 006907 int nOvfl; 006908 u32 ovflPageSize; 006909 006910 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 006911 assert( pInfo->nLocal!=pInfo->nPayload ); 006912 testcase( pCell + pInfo->nSize == pPage->aDataEnd ); 006913 testcase( pCell + (pInfo->nSize-1) == pPage->aDataEnd ); 006914 if( pCell + pInfo->nSize > pPage->aDataEnd ){ 006915 /* Cell extends past end of page */ 006916 return SQLITE_CORRUPT_PAGE(pPage); 006917 } 006918 ovflPgno = get4byte(pCell + pInfo->nSize - 4); 006919 pBt = pPage->pBt; 006920 assert( pBt->usableSize > 4 ); 006921 ovflPageSize = pBt->usableSize - 4; 006922 nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize; 006923 assert( nOvfl>0 || 006924 (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize) 006925 ); 006926 while( nOvfl-- ){ 006927 Pgno iNext = 0; 006928 MemPage 
*pOvfl = 0; 006929 if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){ 006930 /* 0 is not a legal page number and page 1 cannot be an 006931 ** overflow page. Therefore if ovflPgno<2 or past the end of the 006932 ** file the database must be corrupt. */ 006933 return SQLITE_CORRUPT_BKPT; 006934 } 006935 if( nOvfl ){ 006936 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext); 006937 if( rc ) return rc; 006938 } 006939 006940 if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) ) 006941 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1 006942 ){ 006943 /* There is no reason any cursor should have an outstanding reference 006944 ** to an overflow page belonging to a cell that is being deleted/updated. 006945 ** So if there exists more than one reference to this page, then it 006946 ** must not really be an overflow page and the database must be corrupt. 006947 ** It is helpful to detect this before calling freePage2(), as 006948 ** freePage2() may zero the page contents if secure-delete mode is 006949 ** enabled. If this 'overflow' page happens to be a page that the 006950 ** caller is iterating through or using in some other way, this 006951 ** can be problematic. 006952 */ 006953 rc = SQLITE_CORRUPT_BKPT; 006954 }else{ 006955 rc = freePage2(pBt, pOvfl, ovflPgno); 006956 } 006957 006958 if( pOvfl ){ 006959 sqlite3PagerUnref(pOvfl->pDbPage); 006960 } 006961 if( rc ) return rc; 006962 ovflPgno = iNext; 006963 } 006964 return SQLITE_OK; 006965 } 006966 006967 /* Call xParseCell to compute the size of a cell. If the cell contains 006968 ** overflow, then invoke cellClearOverflow to clear out that overflow. 006969 ** Store the result code (SQLITE_OK or some error code) in rc. 006970 ** 006971 ** Implemented as macro to force inlining for performance. 
006972 */ 006973 #define BTREE_CLEAR_CELL(rc, pPage, pCell, sInfo) \ 006974 pPage->xParseCell(pPage, pCell, &sInfo); \ 006975 if( sInfo.nLocal!=sInfo.nPayload ){ \ 006976 rc = clearCellOverflow(pPage, pCell, &sInfo); \ 006977 }else{ \ 006978 rc = SQLITE_OK; \ 006979 } 006980 006981 006982 /* 006983 ** Create the byte sequence used to represent a cell on page pPage 006984 ** and write that byte sequence into pCell[]. Overflow pages are 006985 ** allocated and filled in as necessary. The calling procedure 006986 ** is responsible for making sure sufficient space has been allocated 006987 ** for pCell[]. 006988 ** 006989 ** Note that pCell does not necessary need to point to the pPage->aData 006990 ** area. pCell might point to some temporary storage. The cell will 006991 ** be constructed in this temporary area then copied into pPage->aData 006992 ** later. 006993 */ 006994 static int fillInCell( 006995 MemPage *pPage, /* The page that contains the cell */ 006996 unsigned char *pCell, /* Complete text of the cell */ 006997 const BtreePayload *pX, /* Payload with which to construct the cell */ 006998 int *pnSize /* Write cell size here */ 006999 ){ 007000 int nPayload; 007001 const u8 *pSrc; 007002 int nSrc, n, rc, mn; 007003 int spaceLeft; 007004 MemPage *pToRelease; 007005 unsigned char *pPrior; 007006 unsigned char *pPayload; 007007 BtShared *pBt; 007008 Pgno pgnoOvfl; 007009 int nHeader; 007010 007011 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007012 007013 /* pPage is not necessarily writeable since pCell might be auxiliary 007014 ** buffer space that is separate from the pPage buffer area */ 007015 assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize] 007016 || sqlite3PagerIswriteable(pPage->pDbPage) ); 007017 007018 /* Fill in the header. 
*/ 007019 nHeader = pPage->childPtrSize; 007020 if( pPage->intKey ){ 007021 nPayload = pX->nData + pX->nZero; 007022 pSrc = pX->pData; 007023 nSrc = pX->nData; 007024 assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */ 007025 nHeader += putVarint32(&pCell[nHeader], nPayload); 007026 nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey); 007027 }else{ 007028 assert( pX->nKey<=0x7fffffff && pX->pKey!=0 ); 007029 nSrc = nPayload = (int)pX->nKey; 007030 pSrc = pX->pKey; 007031 nHeader += putVarint32(&pCell[nHeader], nPayload); 007032 } 007033 007034 /* Fill in the payload */ 007035 pPayload = &pCell[nHeader]; 007036 if( nPayload<=pPage->maxLocal ){ 007037 /* This is the common case where everything fits on the btree page 007038 ** and no overflow pages are required. */ 007039 n = nHeader + nPayload; 007040 testcase( n==3 ); 007041 testcase( n==4 ); 007042 if( n<4 ){ 007043 n = 4; 007044 pPayload[nPayload] = 0; 007045 } 007046 *pnSize = n; 007047 assert( nSrc<=nPayload ); 007048 testcase( nSrc<nPayload ); 007049 memcpy(pPayload, pSrc, nSrc); 007050 memset(pPayload+nSrc, 0, nPayload-nSrc); 007051 return SQLITE_OK; 007052 } 007053 007054 /* If we reach this point, it means that some of the content will need 007055 ** to spill onto overflow pages. 007056 */ 007057 mn = pPage->minLocal; 007058 n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4); 007059 testcase( n==pPage->maxLocal ); 007060 testcase( n==pPage->maxLocal+1 ); 007061 if( n > pPage->maxLocal ) n = mn; 007062 spaceLeft = n; 007063 *pnSize = n + nHeader + 4; 007064 pPrior = &pCell[nHeader+n]; 007065 pToRelease = 0; 007066 pgnoOvfl = 0; 007067 pBt = pPage->pBt; 007068 007069 /* At this point variables should be set as follows: 007070 ** 007071 ** nPayload Total payload size in bytes 007072 ** pPayload Begin writing payload here 007073 ** spaceLeft Space available at pPayload. If nPayload>spaceLeft, 007074 ** that means content must spill into overflow pages. 
007075 ** *pnSize Size of the local cell (not counting overflow pages) 007076 ** pPrior Where to write the pgno of the first overflow page 007077 ** 007078 ** Use a call to btreeParseCellPtr() to verify that the values above 007079 ** were computed correctly. 007080 */ 007081 #ifdef SQLITE_DEBUG 007082 { 007083 CellInfo info; 007084 pPage->xParseCell(pPage, pCell, &info); 007085 assert( nHeader==(int)(info.pPayload - pCell) ); 007086 assert( info.nKey==pX->nKey ); 007087 assert( *pnSize == info.nSize ); 007088 assert( spaceLeft == info.nLocal ); 007089 } 007090 #endif 007091 007092 /* Write the payload into the local Cell and any extra into overflow pages */ 007093 while( 1 ){ 007094 n = nPayload; 007095 if( n>spaceLeft ) n = spaceLeft; 007096 007097 /* If pToRelease is not zero than pPayload points into the data area 007098 ** of pToRelease. Make sure pToRelease is still writeable. */ 007099 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 007100 007101 /* If pPayload is part of the data area of pPage, then make sure pPage 007102 ** is still writeable */ 007103 assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize] 007104 || sqlite3PagerIswriteable(pPage->pDbPage) ); 007105 007106 if( nSrc>=n ){ 007107 memcpy(pPayload, pSrc, n); 007108 }else if( nSrc>0 ){ 007109 n = nSrc; 007110 memcpy(pPayload, pSrc, n); 007111 }else{ 007112 memset(pPayload, 0, n); 007113 } 007114 nPayload -= n; 007115 if( nPayload<=0 ) break; 007116 pPayload += n; 007117 pSrc += n; 007118 nSrc -= n; 007119 spaceLeft -= n; 007120 if( spaceLeft==0 ){ 007121 MemPage *pOvfl = 0; 007122 #ifndef SQLITE_OMIT_AUTOVACUUM 007123 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */ 007124 if( pBt->autoVacuum ){ 007125 do{ 007126 pgnoOvfl++; 007127 } while( 007128 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 007129 ); 007130 } 007131 #endif 007132 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0); 007133 #ifndef 
SQLITE_OMIT_AUTOVACUUM 007134 /* If the database supports auto-vacuum, and the second or subsequent 007135 ** overflow page is being allocated, add an entry to the pointer-map 007136 ** for that page now. 007137 ** 007138 ** If this is the first overflow page, then write a partial entry 007139 ** to the pointer-map. If we write nothing to this pointer-map slot, 007140 ** then the optimistic overflow chain processing in clearCell() 007141 ** may misinterpret the uninitialized values and delete the 007142 ** wrong pages from the database. 007143 */ 007144 if( pBt->autoVacuum && rc==SQLITE_OK ){ 007145 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); 007146 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc); 007147 if( rc ){ 007148 releasePage(pOvfl); 007149 } 007150 } 007151 #endif 007152 if( rc ){ 007153 releasePage(pToRelease); 007154 return rc; 007155 } 007156 007157 /* If pToRelease is not zero than pPrior points into the data area 007158 ** of pToRelease. Make sure pToRelease is still writeable. */ 007159 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 007160 007161 /* If pPrior is part of the data area of pPage, then make sure pPage 007162 ** is still writeable */ 007163 assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize] 007164 || sqlite3PagerIswriteable(pPage->pDbPage) ); 007165 007166 put4byte(pPrior, pgnoOvfl); 007167 releasePage(pToRelease); 007168 pToRelease = pOvfl; 007169 pPrior = pOvfl->aData; 007170 put4byte(pPrior, 0); 007171 pPayload = &pOvfl->aData[4]; 007172 spaceLeft = pBt->usableSize - 4; 007173 } 007174 } 007175 releasePage(pToRelease); 007176 return SQLITE_OK; 007177 } 007178 007179 /* 007180 ** Remove the i-th cell from pPage. This routine effects pPage only. 007181 ** The cell content is not freed or deallocated. It is assumed that 007182 ** the cell content has been copied someplace else. This routine just 007183 ** removes the reference to the cell from pPage. 
007184 ** 007185 ** "sz" must be the number of bytes in the cell. 007186 */ 007187 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ 007188 u32 pc; /* Offset to cell content of cell being deleted */ 007189 u8 *data; /* pPage->aData */ 007190 u8 *ptr; /* Used to move bytes around within data[] */ 007191 int rc; /* The return code */ 007192 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */ 007193 007194 if( *pRC ) return; 007195 assert( idx>=0 ); 007196 assert( idx<pPage->nCell ); 007197 assert( CORRUPT_DB || sz==cellSize(pPage, idx) ); 007198 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 007199 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007200 assert( pPage->nFree>=0 ); 007201 data = pPage->aData; 007202 ptr = &pPage->aCellIdx[2*idx]; 007203 assert( pPage->pBt->usableSize > (u32)(ptr-data) ); 007204 pc = get2byte(ptr); 007205 hdr = pPage->hdrOffset; 007206 testcase( pc==(u32)get2byte(&data[hdr+5]) ); 007207 testcase( pc+sz==pPage->pBt->usableSize ); 007208 if( pc+sz > pPage->pBt->usableSize ){ 007209 *pRC = SQLITE_CORRUPT_BKPT; 007210 return; 007211 } 007212 rc = freeSpace(pPage, pc, sz); 007213 if( rc ){ 007214 *pRC = rc; 007215 return; 007216 } 007217 pPage->nCell--; 007218 if( pPage->nCell==0 ){ 007219 memset(&data[hdr+1], 0, 4); 007220 data[hdr+7] = 0; 007221 put2byte(&data[hdr+5], pPage->pBt->usableSize); 007222 pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset 007223 - pPage->childPtrSize - 8; 007224 }else{ 007225 memmove(ptr, ptr+2, 2*(pPage->nCell - idx)); 007226 put2byte(&data[hdr+3], pPage->nCell); 007227 pPage->nFree += 2; 007228 } 007229 } 007230 007231 /* 007232 ** Insert a new cell on pPage at cell index "i". pCell points to the 007233 ** content of the cell. 007234 ** 007235 ** If the cell content will fit on the page, then put it there. If it 007236 ** will not fit, then make a copy of the cell content into pTemp if 007237 ** pTemp is not null. 
Regardless of pTemp, allocate a new entry 007238 ** in pPage->apOvfl[] and make it point to the cell content (either 007239 ** in pTemp or the original pCell) and also record its index. 007240 ** Allocating a new entry in pPage->aCell[] implies that 007241 ** pPage->nOverflow is incremented. 007242 ** 007243 ** The insertCellFast() routine below works exactly the same as 007244 ** insertCell() except that it lacks the pTemp and iChild parameters 007245 ** which are assumed zero. Other than that, the two routines are the 007246 ** same. 007247 ** 007248 ** Fixes or enhancements to this routine should be reflected in 007249 ** insertCellFast()! 007250 */ 007251 static int insertCell( 007252 MemPage *pPage, /* Page into which we are copying */ 007253 int i, /* New cell becomes the i-th cell of the page */ 007254 u8 *pCell, /* Content of the new cell */ 007255 int sz, /* Bytes of content in pCell */ 007256 u8 *pTemp, /* Temp storage space for pCell, if needed */ 007257 Pgno iChild /* If non-zero, replace first 4 bytes with this value */ 007258 ){ 007259 int idx = 0; /* Where to write new cell content in data[] */ 007260 int j; /* Loop counter */ 007261 u8 *data; /* The content of the whole page */ 007262 u8 *pIns; /* The point in pPage->aCellIdx[] where no cell inserted */ 007263 007264 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); 007265 assert( MX_CELL(pPage->pBt)<=10921 ); 007266 assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB ); 007267 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) ); 007268 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) ); 007269 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007270 assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB ); 007271 assert( pPage->nFree>=0 ); 007272 assert( iChild>0 ); 007273 if( pPage->nOverflow || sz+2>pPage->nFree ){ 007274 if( pTemp ){ 007275 memcpy(pTemp, pCell, sz); 007276 pCell = pTemp; 007277 } 007278 put4byte(pCell, iChild); 007279 j = pPage->nOverflow++; 007280 /* 
Comparison against ArraySize-1 since we hold back one extra slot 007281 ** as a contingency. In other words, never need more than 3 overflow 007282 ** slots but 4 are allocated, just to be safe. */ 007283 assert( j < ArraySize(pPage->apOvfl)-1 ); 007284 pPage->apOvfl[j] = pCell; 007285 pPage->aiOvfl[j] = (u16)i; 007286 007287 /* When multiple overflows occur, they are always sequential and in 007288 ** sorted order. This invariants arise because multiple overflows can 007289 ** only occur when inserting divider cells into the parent page during 007290 ** balancing, and the dividers are adjacent and sorted. 007291 */ 007292 assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */ 007293 assert( j==0 || i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */ 007294 }else{ 007295 int rc = sqlite3PagerWrite(pPage->pDbPage); 007296 if( NEVER(rc!=SQLITE_OK) ){ 007297 return rc; 007298 } 007299 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 007300 data = pPage->aData; 007301 assert( &data[pPage->cellOffset]==pPage->aCellIdx ); 007302 rc = allocateSpace(pPage, sz, &idx); 007303 if( rc ){ return rc; } 007304 /* The allocateSpace() routine guarantees the following properties 007305 ** if it returns successfully */ 007306 assert( idx >= 0 ); 007307 assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB ); 007308 assert( idx+sz <= (int)pPage->pBt->usableSize ); 007309 pPage->nFree -= (u16)(2 + sz); 007310 /* In a corrupt database where an entry in the cell index section of 007311 ** a btree page has a value of 3 or less, the pCell value might point 007312 ** as many as 4 bytes in front of the start of the aData buffer for 007313 ** the source page. 
Make sure this does not cause problems by not 007314 ** reading the first 4 bytes */ 007315 memcpy(&data[idx+4], pCell+4, sz-4); 007316 put4byte(&data[idx], iChild); 007317 pIns = pPage->aCellIdx + i*2; 007318 memmove(pIns+2, pIns, 2*(pPage->nCell - i)); 007319 put2byte(pIns, idx); 007320 pPage->nCell++; 007321 /* increment the cell count */ 007322 if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++; 007323 assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB ); 007324 #ifndef SQLITE_OMIT_AUTOVACUUM 007325 if( pPage->pBt->autoVacuum ){ 007326 int rc2 = SQLITE_OK; 007327 /* The cell may contain a pointer to an overflow page. If so, write 007328 ** the entry for the overflow page into the pointer map. 007329 */ 007330 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc2); 007331 if( rc2 ) return rc2; 007332 } 007333 #endif 007334 } 007335 return SQLITE_OK; 007336 } 007337 007338 /* 007339 ** This variant of insertCell() assumes that the pTemp and iChild 007340 ** parameters are both zero. Use this variant in sqlite3BtreeInsert() 007341 ** for performance improvement, and also so that this variant is only 007342 ** called from that one place, and is thus inlined, and thus runs must 007343 ** faster. 007344 ** 007345 ** Fixes or enhancements to this routine should be reflected into 007346 ** the insertCell() routine. 
007347 */ 007348 static int insertCellFast( 007349 MemPage *pPage, /* Page into which we are copying */ 007350 int i, /* New cell becomes the i-th cell of the page */ 007351 u8 *pCell, /* Content of the new cell */ 007352 int sz /* Bytes of content in pCell */ 007353 ){ 007354 int idx = 0; /* Where to write new cell content in data[] */ 007355 int j; /* Loop counter */ 007356 u8 *data; /* The content of the whole page */ 007357 u8 *pIns; /* The point in pPage->aCellIdx[] where no cell inserted */ 007358 007359 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); 007360 assert( MX_CELL(pPage->pBt)<=10921 ); 007361 assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB ); 007362 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) ); 007363 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) ); 007364 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007365 assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB ); 007366 assert( pPage->nFree>=0 ); 007367 assert( pPage->nOverflow==0 ); 007368 if( sz+2>pPage->nFree ){ 007369 j = pPage->nOverflow++; 007370 /* Comparison against ArraySize-1 since we hold back one extra slot 007371 ** as a contingency. In other words, never need more than 3 overflow 007372 ** slots but 4 are allocated, just to be safe. */ 007373 assert( j < ArraySize(pPage->apOvfl)-1 ); 007374 pPage->apOvfl[j] = pCell; 007375 pPage->aiOvfl[j] = (u16)i; 007376 007377 /* When multiple overflows occur, they are always sequential and in 007378 ** sorted order. This invariants arise because multiple overflows can 007379 ** only occur when inserting divider cells into the parent page during 007380 ** balancing, and the dividers are adjacent and sorted. 
007381 */ 007382 assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */ 007383 assert( j==0 || i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */ 007384 }else{ 007385 int rc = sqlite3PagerWrite(pPage->pDbPage); 007386 if( rc!=SQLITE_OK ){ 007387 return rc; 007388 } 007389 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 007390 data = pPage->aData; 007391 assert( &data[pPage->cellOffset]==pPage->aCellIdx ); 007392 rc = allocateSpace(pPage, sz, &idx); 007393 if( rc ){ return rc; } 007394 /* The allocateSpace() routine guarantees the following properties 007395 ** if it returns successfully */ 007396 assert( idx >= 0 ); 007397 assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB ); 007398 assert( idx+sz <= (int)pPage->pBt->usableSize ); 007399 pPage->nFree -= (u16)(2 + sz); 007400 memcpy(&data[idx], pCell, sz); 007401 pIns = pPage->aCellIdx + i*2; 007402 memmove(pIns+2, pIns, 2*(pPage->nCell - i)); 007403 put2byte(pIns, idx); 007404 pPage->nCell++; 007405 /* increment the cell count */ 007406 if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++; 007407 assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB ); 007408 #ifndef SQLITE_OMIT_AUTOVACUUM 007409 if( pPage->pBt->autoVacuum ){ 007410 int rc2 = SQLITE_OK; 007411 /* The cell may contain a pointer to an overflow page. If so, write 007412 ** the entry for the overflow page into the pointer map. 007413 */ 007414 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc2); 007415 if( rc2 ) return rc2; 007416 } 007417 #endif 007418 } 007419 return SQLITE_OK; 007420 } 007421 007422 /* 007423 ** The following parameters determine how many adjacent pages get involved 007424 ** in a balancing operation. NN is the number of neighbors on either side 007425 ** of the page that participate in the balancing operation. NB is the 007426 ** total number of pages that participate, including the target page and 007427 ** NN neighbors on either side. 
007428 ** 007429 ** The minimum value of NN is 1 (of course). Increasing NN above 1 007430 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance 007431 ** in exchange for a larger degradation in INSERT and UPDATE performance. 007432 ** The value of NN appears to give the best results overall. 007433 ** 007434 ** (Later:) The description above makes it seem as if these values are 007435 ** tunable - as if you could change them and recompile and it would all work. 007436 ** But that is unlikely. NB has been 3 since the inception of SQLite and 007437 ** we have never tested any other value. 007438 */ 007439 #define NN 1 /* Number of neighbors on either side of pPage */ 007440 #define NB 3 /* (NN*2+1): Total pages involved in the balance */ 007441 007442 /* 007443 ** A CellArray object contains a cache of pointers and sizes for a 007444 ** consecutive sequence of cells that might be held on multiple pages. 007445 ** 007446 ** The cells in this array are the divider cell or cells from the pParent 007447 ** page plus up to three child pages. There are a total of nCell cells. 007448 ** 007449 ** pRef is a pointer to one of the pages that contributes cells. This is 007450 ** used to access information such as MemPage.intKey and MemPage.pBt->pageSize 007451 ** which should be common to all pages that contribute cells to this array. 007452 ** 007453 ** apCell[] and szCell[] hold, respectively, pointers to the start of each 007454 ** cell and the size of each cell. Some of the apCell[] pointers might refer 007455 ** to overflow cells. In other words, some apCel[] pointers might not point 007456 ** to content area of the pages. 007457 ** 007458 ** A szCell[] of zero means the size of that cell has not yet been computed. 
**
** The cells come from as many as four different pages:
**
**             -----------
**             | Parent  |
**             -----------
**            /     |     \
**           /      |      \
**  ---------   ---------   ---------
**  |Child-1|   |Child-2|   |Child-3|
**  ---------   ---------   ---------
**
** The order of cells in the array for an index btree is:
**
**       1.  All cells from Child-1 in order
**       2.  The first divider cell from Parent
**       3.  All cells from Child-2 in order
**       4.  The second divider cell from Parent
**       5.  All cells from Child-3 in order
**
** For a table-btree (with rowids) the items 2 and 4 are empty because
** content exists only in leaves and there are no divider cells.
**
** For an index btree, the apEnd[] array holds pointers to the end of page
** for Child-1, the Parent, Child-2, the Parent (again), and Child-3,
** respectively. The ixNx[] array holds the number of cells contained in
** each of these 5 stages, and all stages to the left.  Hence:
**
**    ixNx[0] = Number of cells in Child-1.
**    ixNx[1] = Number of cells in Child-1 plus 1 for first divider.
**    ixNx[2] = Number of cells in Child-1 and Child-2 + 1 for 1st divider.
**    ixNx[3] = Number of cells in Child-1 and Child-2 + both divider cells
**    ixNx[4] = Total number of cells.
**
** For a table-btree, the concept is similar, except only apEnd[0]..apEnd[2]
** are used and they point to the leaf pages only, and the ixNx values are:
**
**    ixNx[0] = Number of cells in Child-1.
**    ixNx[1] = Number of cells in Child-1 and Child-2.
**    ixNx[2] = Total number of cells.
**
** Sometimes when deleting, a child page can have zero cells.  In those
** cases, ixNx[] entries with higher indexes, and the corresponding apEnd[]
** entries, shift down.
The end result is that each ixNx[] entry should 007503 ** be larger than the previous 007504 */ 007505 typedef struct CellArray CellArray; 007506 struct CellArray { 007507 int nCell; /* Number of cells in apCell[] */ 007508 MemPage *pRef; /* Reference page */ 007509 u8 **apCell; /* All cells begin balanced */ 007510 u16 *szCell; /* Local size of all cells in apCell[] */ 007511 u8 *apEnd[NB*2]; /* MemPage.aDataEnd values */ 007512 int ixNx[NB*2]; /* Index of at which we move to the next apEnd[] */ 007513 }; 007514 007515 /* 007516 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been 007517 ** computed. 007518 */ 007519 static void populateCellCache(CellArray *p, int idx, int N){ 007520 MemPage *pRef = p->pRef; 007521 u16 *szCell = p->szCell; 007522 assert( idx>=0 && idx+N<=p->nCell ); 007523 while( N>0 ){ 007524 assert( p->apCell[idx]!=0 ); 007525 if( szCell[idx]==0 ){ 007526 szCell[idx] = pRef->xCellSize(pRef, p->apCell[idx]); 007527 }else{ 007528 assert( CORRUPT_DB || 007529 szCell[idx]==pRef->xCellSize(pRef, p->apCell[idx]) ); 007530 } 007531 idx++; 007532 N--; 007533 } 007534 } 007535 007536 /* 007537 ** Return the size of the Nth element of the cell array 007538 */ 007539 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){ 007540 assert( N>=0 && N<p->nCell ); 007541 assert( p->szCell[N]==0 ); 007542 p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]); 007543 return p->szCell[N]; 007544 } 007545 static u16 cachedCellSize(CellArray *p, int N){ 007546 assert( N>=0 && N<p->nCell ); 007547 if( p->szCell[N] ) return p->szCell[N]; 007548 return computeCellSize(p, N); 007549 } 007550 007551 /* 007552 ** Array apCell[] contains pointers to nCell b-tree page cells. The 007553 ** szCell[] array contains the size in bytes of each cell. This function 007554 ** replaces the current contents of page pPg with the contents of the cell 007555 ** array. 007556 ** 007557 ** Some of the cells in apCell[] may currently be stored in pPg. 
This 007558 ** function works around problems caused by this by making a copy of any 007559 ** such cells before overwriting the page data. 007560 ** 007561 ** The MemPage.nFree field is invalidated by this function. It is the 007562 ** responsibility of the caller to set it correctly. 007563 */ 007564 static int rebuildPage( 007565 CellArray *pCArray, /* Content to be added to page pPg */ 007566 int iFirst, /* First cell in pCArray to use */ 007567 int nCell, /* Final number of cells on page */ 007568 MemPage *pPg /* The page to be reconstructed */ 007569 ){ 007570 const int hdr = pPg->hdrOffset; /* Offset of header on pPg */ 007571 u8 * const aData = pPg->aData; /* Pointer to data for pPg */ 007572 const int usableSize = pPg->pBt->usableSize; 007573 u8 * const pEnd = &aData[usableSize]; 007574 int i = iFirst; /* Which cell to copy from pCArray*/ 007575 u32 j; /* Start of cell content area */ 007576 int iEnd = i+nCell; /* Loop terminator */ 007577 u8 *pCellptr = pPg->aCellIdx; 007578 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); 007579 u8 *pData; 007580 int k; /* Current slot in pCArray->apEnd[] */ 007581 u8 *pSrcEnd; /* Current pCArray->apEnd[k] value */ 007582 007583 assert( nCell>0 ); 007584 assert( i<iEnd ); 007585 j = get2byte(&aData[hdr+5]); 007586 if( j>(u32)usableSize ){ j = 0; } 007587 memcpy(&pTmp[j], &aData[j], usableSize - j); 007588 007589 assert( pCArray->ixNx[NB*2-1]>i ); 007590 for(k=0; pCArray->ixNx[k]<=i; k++){} 007591 pSrcEnd = pCArray->apEnd[k]; 007592 007593 pData = pEnd; 007594 while( 1/*exit by break*/ ){ 007595 u8 *pCell = pCArray->apCell[i]; 007596 u16 sz = pCArray->szCell[i]; 007597 assert( sz>0 ); 007598 if( SQLITE_WITHIN(pCell,aData+j,pEnd) ){ 007599 if( ((uptr)(pCell+sz))>(uptr)pEnd ) return SQLITE_CORRUPT_BKPT; 007600 pCell = &pTmp[pCell - aData]; 007601 }else if( (uptr)(pCell+sz)>(uptr)pSrcEnd 007602 && (uptr)(pCell)<(uptr)pSrcEnd 007603 ){ 007604 return SQLITE_CORRUPT_BKPT; 007605 } 007606 007607 pData -= sz; 007608 
put2byte(pCellptr, (pData - aData)); 007609 pCellptr += 2; 007610 if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT; 007611 memmove(pData, pCell, sz); 007612 assert( sz==pPg->xCellSize(pPg, pCell) || CORRUPT_DB ); 007613 i++; 007614 if( i>=iEnd ) break; 007615 if( pCArray->ixNx[k]<=i ){ 007616 k++; 007617 pSrcEnd = pCArray->apEnd[k]; 007618 } 007619 } 007620 007621 /* The pPg->nFree field is now set incorrectly. The caller will fix it. */ 007622 pPg->nCell = nCell; 007623 pPg->nOverflow = 0; 007624 007625 put2byte(&aData[hdr+1], 0); 007626 put2byte(&aData[hdr+3], pPg->nCell); 007627 put2byte(&aData[hdr+5], pData - aData); 007628 aData[hdr+7] = 0x00; 007629 return SQLITE_OK; 007630 } 007631 007632 /* 007633 ** The pCArray objects contains pointers to b-tree cells and the cell sizes. 007634 ** This function attempts to add the cells stored in the array to page pPg. 007635 ** If it cannot (because the page needs to be defragmented before the cells 007636 ** will fit), non-zero is returned. Otherwise, if the cells are added 007637 ** successfully, zero is returned. 007638 ** 007639 ** Argument pCellptr points to the first entry in the cell-pointer array 007640 ** (part of page pPg) to populate. After cell apCell[0] is written to the 007641 ** page body, a 16-bit offset is written to pCellptr. And so on, for each 007642 ** cell in the array. It is the responsibility of the caller to ensure 007643 ** that it is safe to overwrite this part of the cell-pointer array. 007644 ** 007645 ** When this function is called, *ppData points to the start of the 007646 ** content area on page pPg. If the size of the content area is extended, 007647 ** *ppData is updated to point to the new start of the content area 007648 ** before returning. 007649 ** 007650 ** Finally, argument pBegin points to the byte immediately following the 007651 ** end of the space required by this page for the cell-pointer area (for 007652 ** all cells - not just those inserted by the current call). 
If the content 007653 ** area must be extended to before this point in order to accommodate all 007654 ** cells in apCell[], then the cells do not fit and non-zero is returned. 007655 */ 007656 static int pageInsertArray( 007657 MemPage *pPg, /* Page to add cells to */ 007658 u8 *pBegin, /* End of cell-pointer array */ 007659 u8 **ppData, /* IN/OUT: Page content-area pointer */ 007660 u8 *pCellptr, /* Pointer to cell-pointer area */ 007661 int iFirst, /* Index of first cell to add */ 007662 int nCell, /* Number of cells to add to pPg */ 007663 CellArray *pCArray /* Array of cells */ 007664 ){ 007665 int i = iFirst; /* Loop counter - cell index to insert */ 007666 u8 *aData = pPg->aData; /* Complete page */ 007667 u8 *pData = *ppData; /* Content area. A subset of aData[] */ 007668 int iEnd = iFirst + nCell; /* End of loop. One past last cell to insert */ 007669 int k; /* Current slot in pCArray->apEnd[] */ 007670 u8 *pEnd; /* Maximum extent of cell data */ 007671 assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */ 007672 if( iEnd<=iFirst ) return 0; /* No cells requested: nothing to insert */ 007673 assert( pCArray->ixNx[NB*2-1]>i ); 007674 for(k=0; pCArray->ixNx[k]<=i ; k++){} /* Find the first ixNx[] entry greater than i */ 007675 pEnd = pCArray->apEnd[k]; 007676 while( 1 /*Exit by break*/ ){ 007677 int sz, rc; /* rc is passed to pageFindSlot(); its value is not examined here */ 007678 u8 *pSlot; /* Destination of the cell within the page */ 007679 assert( pCArray->szCell[i]!=0 ); 007680 sz = pCArray->szCell[i]; 007681 if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){ /* pageFindSlot() skipped or returned no slot: take sz bytes from the front of the content area */ 007682 if( (pData - pBegin)<sz ) return 1; /* Fewer than sz bytes remain above pBegin */ 007683 pData -= sz; 007684 pSlot = pData; 007685 } 007686 /* pSlot and pCArray->apCell[i] will never overlap on a well-formed 007687 ** database. But they might for a corrupt database.
Hence use memmove() 007688 ** since memcpy() sends SIGABRT with overlapping buffers on OpenBSD */ 007689 assert( (pSlot+sz)<=pCArray->apCell[i] 007690 || pSlot>=(pCArray->apCell[i]+sz) 007691 || CORRUPT_DB ); 007692 if( (uptr)(pCArray->apCell[i]+sz)>(uptr)pEnd 007693 && (uptr)(pCArray->apCell[i])<(uptr)pEnd 007694 ){ /* Source cell starts before pEnd but runs past it */ 007695 assert( CORRUPT_DB ); 007696 (void)SQLITE_CORRUPT_BKPT; 007697 return 1; 007698 } 007699 memmove(pSlot, pCArray->apCell[i], sz); 007700 put2byte(pCellptr, (pSlot - aData)); /* Store the cell's page offset in the cell-pointer array */ 007701 pCellptr += 2; 007702 i++; 007703 if( i>=iEnd ) break; 007704 if( pCArray->ixNx[k]<=i ){ /* Index i has reached ixNx[k]: switch to the next apEnd[] entry */ 007705 k++; 007706 pEnd = pCArray->apEnd[k]; 007707 } 007708 } 007709 *ppData = pData; /* Report the new content-area start to the caller */ 007710 return 0; 007711 } 007712 007713 /* 007714 ** The pCArray object contains pointers to b-tree cells and their sizes. 007715 ** 007716 ** This function adds the space associated with each cell in the array 007717 ** that is currently stored within the body of pPg to the pPg free-list. 007718 ** The cell-pointers and other fields of the page are not updated. 007719 ** 007720 ** This function returns the total number of cells added to the free-list. Exception: zero is returned immediately if a cell that starts a new pending range would extend past the end of the usable page area. 007721 */ 007722 static int pageFreeArray( 007723 MemPage *pPg, /* Page to edit */ 007724 int iFirst, /* First cell to delete */ 007725 int nCell, /* Cells to delete */ 007726 CellArray *pCArray /* Array of cells */ 007727 ){ 007728 u8 * const aData = pPg->aData; 007729 u8 * const pEnd = &aData[pPg->pBt->usableSize]; 007730 u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize]; 007731 int nRet = 0; /* Number of cells counted as freed */ 007732 int i, j; 007733 int iEnd = iFirst + nCell; 007734 int nFree = 0; /* Number of pending ranges in aOfst[]/aAfter[] */ 007735 int aOfst[10]; /* Start offset of each pending range */ 007736 int aAfter[10]; /* Offset just past the end of each pending range */ 007737 007738 for(i=iFirst; i<iEnd; i++){ 007739 u8 *pCell = pCArray->apCell[i]; 007740 if( SQLITE_WITHIN(pCell, pStart, pEnd) ){ 007741 int sz; 007742 int iAfter; 007743 int iOfst; 007744 /* No need to use cachedCellSize() here.
The sizes of all cells that 007745 ** are to be freed have already been computed while deciding which 007746 ** cells need freeing */ 007747 sz = pCArray->szCell[i]; assert( sz>0 ); 007748 iOfst = (u16)(pCell - aData); 007749 iAfter = iOfst+sz; 007750 for(j=0; j<nFree; j++){ /* Extend a pending range that this cell directly precedes or follows */ 007751 if( aOfst[j]==iAfter ){ 007752 aOfst[j] = iOfst; 007753 break; 007754 }else if( aAfter[j]==iOfst ){ 007755 aAfter[j] = iAfter; 007756 break; 007757 } 007758 } 007759 if( j>=nFree ){ /* No adjacent pending range: record a new one */ 007760 if( nFree>=(int)(sizeof(aOfst)/sizeof(aOfst[0])) ){ /* Arrays are full: pass the pending ranges to freeSpace() first */ 007761 for(j=0; j<nFree; j++){ 007762 freeSpace(pPg, aOfst[j], aAfter[j]-aOfst[j]); 007763 } 007764 nFree = 0; 007765 } 007766 aOfst[nFree] = iOfst; 007767 aAfter[nFree] = iAfter; 007768 if( &aData[iAfter]>pEnd ) return 0; /* Cell would extend past the usable area */ 007769 nFree++; 007770 } 007771 nRet++; 007772 } 007773 } 007774 for(j=0; j<nFree; j++){ /* Pass the remaining pending ranges to freeSpace() */ 007775 freeSpace(pPg, aOfst[j], aAfter[j]-aOfst[j]); 007776 } 007777 return nRet; 007778 } 007779 007780 /* 007781 ** pCArray contains pointers to and sizes of all cells in the page being 007782 ** balanced. The current page, pPg, has pPg->nCell cells starting with 007783 ** pCArray->apCell[iOld]. After balancing, this page should hold nNew cells 007784 ** starting at apCell[iNew]. 007785 ** 007786 ** This routine makes the necessary adjustments to pPg so that it contains 007787 ** the correct cells after being balanced. 007788 ** 007789 ** The pPg->nFree field is invalid when this function returns. It is the 007790 ** responsibility of the caller to set it correctly.
007791 */ 007792 static int editPage( 007793 MemPage *pPg, /* Edit this page */ 007794 int iOld, /* Index of first cell currently on page */ 007795 int iNew, /* Index of new first cell on page */ 007796 int nNew, /* Final number of cells on page */ 007797 CellArray *pCArray /* Array of cells and sizes */ 007798 ){ /* Returns SQLITE_OK after an in-place edit. If the in-place edit cannot be completed, the page is rebuilt with rebuildPage() and that result is returned. SQLITE_CORRUPT_BKPT is returned when nShift>nCell, or when a rebuild is needed but nNew<1. */ 007799 u8 * const aData = pPg->aData; 007800 const int hdr = pPg->hdrOffset; 007801 u8 *pBegin = &pPg->aCellIdx[nNew * 2]; /* Just past a cell-pointer array of nNew entries */ 007802 int nCell = pPg->nCell; /* Cells stored on pPg */ 007803 u8 *pData; /* Start of the cell content area */ 007804 u8 *pCellptr; /* Cell-pointer slot where the next insertion begins */ 007805 int i; 007806 int iOldEnd = iOld + pPg->nCell + pPg->nOverflow; /* One past the last old index, overflow cells included */ 007807 int iNewEnd = iNew + nNew; /* One past the last new index */ 007808 007809 #ifdef SQLITE_DEBUG 007810 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); 007811 memcpy(pTmp, aData, pPg->pBt->usableSize); /* Debug only: keep a pre-edit copy of the page for the check at the end */ 007812 #endif 007813 007814 /* Remove cells from the start and end of the page */ 007815 assert( nCell>=0 ); 007816 if( iOld<iNew ){ 007817 int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray); 007818 if( NEVER(nShift>nCell) ) return SQLITE_CORRUPT_BKPT; 007819 memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2); /* Slide the cell-pointer array down by nShift entries */ 007820 nCell -= nShift; 007821 } 007822 if( iNewEnd < iOldEnd ){ 007823 int nTail = pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray); 007824 assert( nCell>=nTail ); 007825 nCell -= nTail; 007826 } 007827 007828 pData = &aData[get2byte(&aData[hdr+5])]; /* Content-area start as stored at header offset 5 */ 007829 if( pData<pBegin ) goto editpage_fail; /* Content area starts below the end of the new cell-pointer array */ 007830 if( NEVER(pData>pPg->aDataEnd) ) goto editpage_fail; 007831 007832 /* Add cells to the start of the page */ 007833 if( iNew<iOld ){ 007834 int nAdd = MIN(nNew,iOld-iNew); 007835 assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB ); 007836 assert( nAdd>=0 ); 007837 pCellptr = pPg->aCellIdx; 007838 memmove(&pCellptr[nAdd*2], pCellptr, nCell*2); /* Open nAdd slots at the front of the cell-pointer array */ 007839 if( pageInsertArray( 007840 pPg, pBegin, &pData, pCellptr, 007841 iNew, nAdd, pCArray 007842 ) ) goto editpage_fail; 007843 nCell += nAdd; 007844 } 007845 007846 /* Add any overflow cells */ 007847 for(i=0; i<pPg->nOverflow;
i++){ 007848 int iCell = (iOld + pPg->aiOvfl[i]) - iNew; /* Slot of this overflow cell on the edited page */ 007849 if( iCell>=0 && iCell<nNew ){ 007850 pCellptr = &pPg->aCellIdx[iCell * 2]; 007851 if( nCell>iCell ){ 007852 memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2); /* Shift later cell pointers up one slot */ 007853 } 007854 nCell++; 007855 cachedCellSize(pCArray, iCell+iNew); /* NOTE(review): presumably populates pCArray->szCell[] for this cell - confirm */ 007856 if( pageInsertArray( 007857 pPg, pBegin, &pData, pCellptr, 007858 iCell+iNew, 1, pCArray 007859 ) ) goto editpage_fail; 007860 } 007861 } 007862 007863 /* Append cells to the end of the page */ 007864 assert( nCell>=0 ); 007865 pCellptr = &pPg->aCellIdx[nCell*2]; 007866 if( pageInsertArray( 007867 pPg, pBegin, &pData, pCellptr, 007868 iNew+nCell, nNew-nCell, pCArray 007869 ) ) goto editpage_fail; 007870 007871 pPg->nCell = nNew; 007872 pPg->nOverflow = 0; 007873 007874 put2byte(&aData[hdr+3], pPg->nCell); /* Cell count at header offset 3 */ 007875 put2byte(&aData[hdr+5], pData - aData); /* Content-area start at header offset 5 */ 007876 007877 #ifdef SQLITE_DEBUG 007878 for(i=0; i<nNew && !CORRUPT_DB; i++){ /* Debug only: each cell on the page must equal its source; sources that lay on this page are read from the pTmp copy */ 007879 u8 *pCell = pCArray->apCell[i+iNew]; 007880 int iOff = get2byteAligned(&pPg->aCellIdx[i*2]); 007881 if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){ 007882 pCell = &pTmp[pCell - aData]; 007883 } 007884 assert( 0==memcmp(pCell, &aData[iOff], 007885 pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) ); 007886 } 007887 #endif 007888 007889 return SQLITE_OK; 007890 editpage_fail: 007891 /* Unable to edit this page. Rebuild it from scratch instead. */ 007892 if( nNew<1 ) return SQLITE_CORRUPT_BKPT; 007893 populateCellCache(pCArray, iNew, nNew); 007894 return rebuildPage(pCArray, iNew, nNew, pPg); 007895 } 007896 007897 007898 #ifndef SQLITE_OMIT_QUICKBALANCE 007899 /* 007900 ** This version of balance() handles the common special case where 007901 ** a new entry is being inserted on the extreme right-end of the 007902 ** tree, in other words, when the new entry will become the largest 007903 ** entry in the tree.
007904 ** 007905 ** Instead of trying to balance the 3 right-most leaf pages, just add 007906 ** a new page to the right-hand side and put the one new entry in 007907 ** that page. This leaves the right side of the tree somewhat 007908 ** unbalanced. But odds are that we will be inserting new entries 007909 ** at the end soon afterwards so the nearly empty page will quickly 007910 ** fill up. On average. 007911 ** 007912 ** pPage is the leaf page which is the right-most page in the tree. 007913 ** pParent is its parent. pPage must have a single overflow entry 007914 ** which is also the right-most entry on the page. 007915 ** 007916 ** The pSpace buffer is used to store a temporary copy of the divider 007917 ** cell that will be inserted into pParent. Such a cell consists of a 4 007918 ** byte page number followed by a variable length integer. In other 007919 ** words, at most 13 bytes. Hence the pSpace buffer must be at 007920 ** least 13 bytes in size. 007921 */ 007922 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ /* Returns SQLITE_OK on success, SQLITE_CORRUPT_BKPT if pPage->nCell is zero, otherwise the error code of the step that failed */ 007923 BtShared *const pBt = pPage->pBt; /* B-Tree Database */ 007924 MemPage *pNew; /* Newly allocated page */ 007925 int rc; /* Return Code */ 007926 Pgno pgnoNew; /* Page number of pNew */ 007927 007928 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007929 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 007930 assert( pPage->nOverflow==1 ); 007931 007932 if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT; /* dbfuzz001.test */ 007933 assert( pPage->nFree>=0 ); 007934 assert( pParent->nFree>=0 ); 007935 007936 /* Allocate a new page. This page will become the right-sibling of 007937 ** pPage. Make the parent page writable, so that the new divider cell 007938 ** may be inserted. If both these operations are successful, proceed.
007939 */ 007940 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 007941 007942 if( rc==SQLITE_OK ){ 007943 007944 u8 *pOut = &pSpace[4]; /* Key bytes are written after the first 4 bytes of pSpace */ 007945 u8 *pCell = pPage->apOvfl[0]; /* The one overflow cell (nOverflow==1 is asserted above) */ 007946 u16 szCell = pPage->xCellSize(pPage, pCell); /* Size of that cell */ 007947 u8 *pStop; /* Upper bound for the varint scans below */ 007948 CellArray b; /* One-cell array used to build pNew */ 007949 007950 assert( sqlite3PagerIswriteable(pNew->pDbPage) ); 007951 assert( CORRUPT_DB || pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); 007952 zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); 007953 b.nCell = 1; 007954 b.pRef = pPage; 007955 b.apCell = &pCell; 007956 b.szCell = &szCell; 007957 b.apEnd[0] = pPage->aDataEnd; 007958 b.ixNx[0] = 2; 007959 b.ixNx[NB*2-1] = 0x7fffffff; 007960 rc = rebuildPage(&b, 0, 1, pNew); 007961 if( NEVER(rc) ){ 007962 releasePage(pNew); 007963 return rc; 007964 } 007965 pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell; /* Space left after cellOffset bytes, one 2-byte cell pointer and the cell itself */ 007966 007967 /* If this is an auto-vacuum database, update the pointer map 007968 ** with entries for the new page, and any pointer from the 007969 ** cell on the page to an overflow page. If either of these 007970 ** operations fails, the return code is set, but the contents 007971 ** of the parent page are still manipulated by the code below. 007972 ** That is Ok, at this point the parent page is guaranteed to 007973 ** be marked as dirty. Returning an error code will cause a 007974 ** rollback, undoing any changes made to the parent page. 007975 */ 007976 if( ISAUTOVACUUM(pBt) ){ 007977 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc); 007978 if( szCell>pNew->minLocal ){ 007979 ptrmapPutOvflPtr(pNew, pNew, pCell, &rc); 007980 } 007981 } 007982 007983 /* Create a divider cell to insert into pParent. The divider cell 007984 ** consists of a 4-byte page number (the page number of pPage) and 007985 ** a variable length key value (which must be the same value as the 007986 ** largest key on pPage). 007987 ** 007988 ** To find the largest key value on pPage, first find the right-most 007989 ** cell on pPage.
The first two fields of this cell are the 007990 ** record-length (a variable length integer at most 32-bits in size) 007991 ** and the key value (a variable length integer, may have any value). 007992 ** The first of the while(...) loops below skips over the record-length 007993 ** field. The second while(...) loop copies the key value from the 007994 ** cell on pPage into the pSpace buffer. 007995 */ 007996 pCell = findCell(pPage, pPage->nCell-1); 007997 pStop = &pCell[9]; /* Examine at most 9 bytes of the record-length field */ 007998 while( (*(pCell++)&0x80) && pCell<pStop ); 007999 pStop = &pCell[9]; /* Copy at most 9 bytes of the key */ 008000 while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop ); 008001 008002 /* Insert the new divider cell into pParent. */ 008003 if( rc==SQLITE_OK ){ 008004 rc = insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace), 008005 0, pPage->pgno); 008006 } 008007 008008 /* Set the right-child pointer of pParent to point to the new page. */ 008009 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); 008010 008011 /* Release the reference to the new page. */ 008012 releasePage(pNew); 008013 } 008014 008015 return rc; 008016 } 008017 #endif /* SQLITE_OMIT_QUICKBALANCE */ 008018 008019 #if 0 008020 /* 008021 ** This function does not contribute anything to the operation of SQLite. 008022 ** it is sometimes activated temporarily while debugging code responsible 008023 ** for setting pointer-map entries.
008024 */ 008025 static int ptrmapCheckPages(MemPage **apPage, int nPage){ /* For each of the nPage pages in apPage[], asserts that the pointer-map entry of every overflow page and child page it references names that page as the parent. Always returns 1. */ 008026 int i, j; 008027 for(i=0; i<nPage; i++){ 008028 Pgno n; /* Parent page number read back by ptrmapGet() */ 008029 u8 e; /* Entry type read back by ptrmapGet() */ 008030 MemPage *pPage = apPage[i]; 008031 BtShared *pBt = pPage->pBt; 008032 assert( pPage->isInit ); 008033 008034 for(j=0; j<pPage->nCell; j++){ 008035 CellInfo info; 008036 u8 *z; 008037 008038 z = findCell(pPage, j); 008039 pPage->xParseCell(pPage, z, &info); 008040 if( info.nLocal<info.nPayload ){ /* Payload is not entirely local: the last 4 bytes of the cell give the overflow page number */ 008041 Pgno ovfl = get4byte(&z[info.nSize-4]); 008042 ptrmapGet(pBt, ovfl, &e, &n); 008043 assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 ); 008044 } 008045 if( !pPage->leaf ){ /* Non-leaf page: the first 4 bytes of the cell give a child page number */ 008046 Pgno child = get4byte(z); 008047 ptrmapGet(pBt, child, &e, &n); 008048 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 008049 } 008050 } 008051 if( !pPage->leaf ){ /* Non-leaf page: also check the child page number stored at header offset 8 */ 008052 Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]); 008053 ptrmapGet(pBt, child, &e, &n); 008054 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 008055 } 008056 } 008057 return 1; 008058 } 008059 #endif 008060 008061 /* 008062 ** This function is used to copy the contents of the b-tree node stored 008063 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then 008064 ** the pointer-map entries for each child page are updated so that the 008065 ** parent page stored in the pointer map is page pTo. If pFrom contained 008066 ** any cells with overflow page pointers, then the corresponding pointer 008067 ** map entries are also updated so that the parent page is page pTo. 008068 ** 008069 ** If pFrom is currently carrying any overflow cells (entries in the 008070 ** MemPage.apOvfl[] array), they are not copied to pTo. 008071 ** 008072 ** Before returning, page pTo is reinitialized using btreeInitPage(). 008073 ** 008074 ** The performance of this function is not critical. It is only used by 008075 ** the balance_shallower() and balance_deeper() procedures, neither of 008076 ** which are called often under normal circumstances.
008077 */ 008078 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ /* Does nothing unless *pRC is SQLITE_OK on entry. An error from btreeInitPage() or btreeComputeFreeSpace() is stored in *pRC; on an auto-vacuum database *pRC receives the result of setChildPtrmaps(). */ 008079 if( (*pRC)==SQLITE_OK ){ 008080 BtShared * const pBt = pFrom->pBt; 008081 u8 * const aFrom = pFrom->aData; 008082 u8 * const aTo = pTo->aData; 008083 int const iFromHdr = pFrom->hdrOffset; 008084 int const iToHdr = ((pTo->pgno==1) ? 100 : 0); /* Destination header offset: 100 on page 1, otherwise 0 */ 008085 int rc; 008086 int iData; /* Content-area offset read from pFrom's header */ 008087 008088 008089 assert( pFrom->isInit ); 008090 assert( pFrom->nFree>=iToHdr ); 008091 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize ); 008092 008093 /* Copy the b-tree node content from page pFrom to page pTo. */ 008094 iData = get2byte(&aFrom[iFromHdr+5]); 008095 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData); /* Bytes from iData up to usableSize, at the same offset in pTo */ 008096 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell); /* Bytes starting at pFrom's header, placed at pTo's header offset */ 008097 008098 /* Reinitialize page pTo so that the contents of the MemPage structure 008099 ** match the new data. The initialization of pTo can actually fail under 008100 ** fairly obscure circumstances, even though it is a copy of initialized 008101 ** page pFrom. 008102 */ 008103 pTo->isInit = 0; 008104 rc = btreeInitPage(pTo); 008105 if( rc==SQLITE_OK ) rc = btreeComputeFreeSpace(pTo); 008106 if( rc!=SQLITE_OK ){ 008107 *pRC = rc; 008108 return; 008109 } 008110 008111 /* If this is an auto-vacuum database, update the pointer-map entries 008112 ** for any b-tree or overflow pages that pTo now contains the pointers to. 008113 */ 008114 if( ISAUTOVACUUM(pBt) ){ 008115 *pRC = setChildPtrmaps(pTo); 008116 } 008117 } 008118 } 008119 008120 /* 008121 ** This routine redistributes cells on the iParentIdx'th child of pParent 008122 ** (hereafter "the page") and up to 2 siblings so that all pages have about the 008123 ** same amount of free space. Usually a single sibling on either side of the 008124 ** page are used in the balancing, though both siblings might come from one 008125 ** side if the page is the first or last child of its parent.
If the page 008126 ** has fewer than 2 siblings (something which can only happen if the page 008127 ** is a root page or a child of a root page) then all available siblings 008128 ** participate in the balancing. 008129 ** 008130 ** The number of siblings of the page might be increased or decreased by 008131 ** one or two in an effort to keep pages nearly full but not over full. 008132 ** 008133 ** Note that when this routine is called, some of the cells on the page 008134 ** might not actually be stored in MemPage.aData[]. This can happen 008135 ** if the page is overfull. This routine ensures that all cells allocated 008136 ** to the page and its siblings fit into MemPage.aData[] before returning. 008137 ** 008138 ** In the course of balancing the page and its siblings, cells may be 008139 ** inserted into or removed from the parent page (pParent). Doing so 008140 ** may cause the parent page to become overfull or underfull. If this 008141 ** happens, it is the responsibility of the caller to invoke the correct 008142 ** balancing routine to fix this problem (see the balance() routine). 008143 ** 008144 ** If this routine fails for any reason, it might leave the database 008145 ** in a corrupted state. So if this routine fails, the database should 008146 ** be rolled back. 008147 ** 008148 ** The third argument to this function, aOvflSpace, is a pointer to a 008149 ** buffer big enough to hold one page. If while inserting cells into the parent 008150 ** page (pParent) the parent page becomes overfull, this buffer is 008151 ** used to store the parent's overflow cells. Because this function inserts 008152 ** a maximum of four divider cells into the parent page, and the maximum 008153 ** size of a cell stored within an internal node is always less than 1/4 008154 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large 008155 ** enough for all overflow cells. 
008156 ** 008157 ** If aOvflSpace is set to a null pointer, this function returns 008158 ** SQLITE_NOMEM. 008159 */ 008160 static int balance_nonroot( 008161 MemPage *pParent, /* Parent page of siblings being balanced */ 008162 int iParentIdx, /* Index of "the page" in pParent */ 008163 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ 008164 int isRoot, /* True if pParent is a root-page */ 008165 int bBulk /* True if this call is part of a bulk load */ 008166 ){ 008167 BtShared *pBt; /* The whole database */ 008168 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ 008169 int nNew = 0; /* Number of pages in apNew[] */ 008170 int nOld; /* Number of pages in apOld[] */ 008171 int i, j, k; /* Loop counters */ 008172 int nxDiv; /* Next divider slot in pParent->aCell[] */ 008173 int rc = SQLITE_OK; /* The return code */ 008174 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */ 008175 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ 008176 int usableSpace; /* Bytes in pPage beyond the header */ 008177 int pageFlags; /* Value of pPage->aData[0] */ 008178 int iSpace1 = 0; /* First unused byte of aSpace1[] */ 008179 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ 008180 int szScratch; /* Size of scratch memory requested */ 008181 MemPage *apOld[NB]; /* pPage and up to two siblings */ 008182 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ 008183 u8 *pRight; /* Location in parent of right-sibling pointer */ 008184 u8 *apDiv[NB-1]; /* Divider cells in pParent */ 008185 int cntNew[NB+2]; /* Index in b.paCell[] of cell after i-th page */ 008186 int cntOld[NB+2]; /* Old index in b.apCell[] */ 008187 int szNew[NB+2]; /* Combined size of cells placed on i-th page */ 008188 u8 *aSpace1; /* Space for copies of dividers cells */ 008189 Pgno pgno; /* Temp var to store a page number in */ 008190 u8 abDone[NB+2]; /* True after i'th new page is populated */ 008191 Pgno aPgno[NB+2]; /* Page numbers of new 
pages before shuffling */ 008192 CellArray b; /* Parsed information on cells being balanced */ 008193 008194 memset(abDone, 0, sizeof(abDone)); 008195 assert( sizeof(b) - sizeof(b.ixNx) == offsetof(CellArray,ixNx) ); 008196 memset(&b, 0, sizeof(b)-sizeof(b.ixNx[0])); 008197 b.ixNx[NB*2-1] = 0x7fffffff; 008198 pBt = pParent->pBt; 008199 assert( sqlite3_mutex_held(pBt->mutex) ); 008200 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 008201 008202 /* At this point pParent may have at most one overflow cell. And if 008203 ** this overflow cell is present, it must be the cell with 008204 ** index iParentIdx. This scenario comes about when this function 008205 ** is called (indirectly) from sqlite3BtreeDelete(). 008206 */ 008207 assert( pParent->nOverflow==0 || pParent->nOverflow==1 ); 008208 assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx ); 008209 008210 if( !aOvflSpace ){ 008211 return SQLITE_NOMEM_BKPT; 008212 } 008213 assert( pParent->nFree>=0 ); 008214 008215 /* Find the sibling pages to balance. Also locate the cells in pParent 008216 ** that divide the siblings. An attempt is made to find NN siblings on 008217 ** either side of pPage. More siblings are taken from one side, however, 008218 ** if there are fewer than NN siblings on the other side. If pParent 008219 ** has NB or fewer children then all children of pParent are taken. 008220 ** 008221 ** This loop also drops the divider cells from the parent page. This 008222 ** way, the remainder of the function does not have to deal with any 008223 ** overflow cells in the parent page, since if any existed they will 008224 ** have already been removed. 
008225 */ 008226 i = pParent->nOverflow + pParent->nCell; 008227 if( i<2 ){ 008228 nxDiv = 0; 008229 }else{ 008230 assert( bBulk==0 || bBulk==1 ); 008231 if( iParentIdx==0 ){ 008232 nxDiv = 0; 008233 }else if( iParentIdx==i ){ 008234 nxDiv = i-2+bBulk; 008235 }else{ 008236 nxDiv = iParentIdx-1; 008237 } 008238 i = 2-bBulk; 008239 } 008240 nOld = i+1; 008241 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){ 008242 pRight = &pParent->aData[pParent->hdrOffset+8]; 008243 }else{ 008244 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow); 008245 } 008246 pgno = get4byte(pRight); 008247 while( 1 ){ 008248 if( rc==SQLITE_OK ){ 008249 rc = getAndInitPage(pBt, pgno, &apOld[i], 0); 008250 } 008251 if( rc ){ 008252 memset(apOld, 0, (i+1)*sizeof(MemPage*)); 008253 goto balance_cleanup; 008254 } 008255 if( apOld[i]->nFree<0 ){ 008256 rc = btreeComputeFreeSpace(apOld[i]); 008257 if( rc ){ 008258 memset(apOld, 0, (i)*sizeof(MemPage*)); 008259 goto balance_cleanup; 008260 } 008261 } 008262 nMaxCells += apOld[i]->nCell + ArraySize(pParent->apOvfl); 008263 if( (i--)==0 ) break; 008264 008265 if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){ 008266 apDiv[i] = pParent->apOvfl[0]; 008267 pgno = get4byte(apDiv[i]); 008268 szNew[i] = pParent->xCellSize(pParent, apDiv[i]); 008269 pParent->nOverflow = 0; 008270 }else{ 008271 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow); 008272 pgno = get4byte(apDiv[i]); 008273 szNew[i] = pParent->xCellSize(pParent, apDiv[i]); 008274 008275 /* Drop the cell from the parent page. apDiv[i] still points to 008276 ** the cell within the parent, even though it has been dropped. 008277 ** This is safe because dropping a cell only overwrites the first 008278 ** four bytes of it, and this function does not need the first 008279 ** four bytes of the divider cell. So the pointer is safe to use 008280 ** later on. 008281 ** 008282 ** But not if we are in secure-delete mode. 
In secure-delete mode, 008283 ** the dropCell() routine will overwrite the entire cell with zeroes. 008284 ** In this case, temporarily copy the cell into the aOvflSpace[] 008285 ** buffer. It will be copied out again as soon as the aSpace[] buffer 008286 ** is allocated. */ 008287 if( pBt->btsFlags & BTS_FAST_SECURE ){ 008288 int iOff; 008289 008290 /* If the following if() condition is not true, the db is corrupted. 008291 ** The call to dropCell() below will detect this. */ 008292 iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData); 008293 if( (iOff+szNew[i])<=(int)pBt->usableSize ){ 008294 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]); 008295 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData]; 008296 } 008297 } 008298 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc); 008299 } 008300 } 008301 008302 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte 008303 ** alignment */ 008304 nMaxCells = (nMaxCells + 3)&~3; 008305 008306 /* 008307 ** Allocate space for memory structures 008308 */ 008309 szScratch = 008310 nMaxCells*sizeof(u8*) /* b.apCell */ 008311 + nMaxCells*sizeof(u16) /* b.szCell */ 008312 + pBt->pageSize; /* aSpace1 */ 008313 008314 assert( szScratch<=7*(int)pBt->pageSize ); 008315 b.apCell = sqlite3StackAllocRaw(0, szScratch ); 008316 if( b.apCell==0 ){ 008317 rc = SQLITE_NOMEM_BKPT; 008318 goto balance_cleanup; 008319 } 008320 b.szCell = (u16*)&b.apCell[nMaxCells]; 008321 aSpace1 = (u8*)&b.szCell[nMaxCells]; 008322 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); 008323 008324 /* 008325 ** Load pointers to all cells on sibling pages and the divider cells 008326 ** into the local b.apCell[] array. Make copies of the divider cells 008327 ** into space obtained from aSpace1[]. The divider cells have already 008328 ** been removed from pParent. 
008329 ** 008330 ** If the siblings are on leaf pages, then the child pointers of the 008331 ** divider cells are stripped from the cells before they are copied 008332 ** into aSpace1[]. In this way, all cells in b.apCell[] are without 008333 ** child pointers. If siblings are not leaves, then all cell in 008334 ** b.apCell[] include child pointers. Either way, all cells in b.apCell[] 008335 ** are alike. 008336 ** 008337 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. 008338 ** leafData: 1 if pPage holds key+data and pParent holds only keys. 008339 */ 008340 b.pRef = apOld[0]; 008341 leafCorrection = b.pRef->leaf*4; 008342 leafData = b.pRef->intKeyLeaf; 008343 for(i=0; i<nOld; i++){ 008344 MemPage *pOld = apOld[i]; 008345 int limit = pOld->nCell; 008346 u8 *aData = pOld->aData; 008347 u16 maskPage = pOld->maskPage; 008348 u8 *piCell = aData + pOld->cellOffset; 008349 u8 *piEnd; 008350 VVA_ONLY( int nCellAtStart = b.nCell; ) 008351 008352 /* Verify that all sibling pages are of the same "type" (table-leaf, 008353 ** table-interior, index-leaf, or index-interior). 008354 */ 008355 if( pOld->aData[0]!=apOld[0]->aData[0] ){ 008356 rc = SQLITE_CORRUPT_PAGE(pOld); 008357 goto balance_cleanup; 008358 } 008359 008360 /* Load b.apCell[] with pointers to all cells in pOld. If pOld 008361 ** contains overflow cells, include them in the b.apCell[] array 008362 ** in the correct spot. 008363 ** 008364 ** Note that when there are multiple overflow cells, it is always the 008365 ** case that they are sequential and adjacent. This invariant arises 008366 ** because multiple overflows can only occurs when inserting divider 008367 ** cells into a parent on a prior balance, and divider cells are always 008368 ** adjacent and are inserted in order. There is an assert() tagged 008369 ** with "NOTE 1" in the overflow cell insertion loop to prove this 008370 ** invariant. 008371 ** 008372 ** This must be done in advance. 
Once the balance starts, the cell 008373 ** offset section of the btree page will be overwritten and we will no 008374 ** long be able to find the cells if a pointer to each cell is not saved 008375 ** first. 008376 */ 008377 memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow)); 008378 if( pOld->nOverflow>0 ){ 008379 if( NEVER(limit<pOld->aiOvfl[0]) ){ 008380 rc = SQLITE_CORRUPT_PAGE(pOld); 008381 goto balance_cleanup; 008382 } 008383 limit = pOld->aiOvfl[0]; 008384 for(j=0; j<limit; j++){ 008385 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell)); 008386 piCell += 2; 008387 b.nCell++; 008388 } 008389 for(k=0; k<pOld->nOverflow; k++){ 008390 assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */ 008391 b.apCell[b.nCell] = pOld->apOvfl[k]; 008392 b.nCell++; 008393 } 008394 } 008395 piEnd = aData + pOld->cellOffset + 2*pOld->nCell; 008396 while( piCell<piEnd ){ 008397 assert( b.nCell<nMaxCells ); 008398 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell)); 008399 piCell += 2; 008400 b.nCell++; 008401 } 008402 assert( (b.nCell-nCellAtStart)==(pOld->nCell+pOld->nOverflow) ); 008403 008404 cntOld[i] = b.nCell; 008405 if( i<nOld-1 && !leafData){ 008406 u16 sz = (u16)szNew[i]; 008407 u8 *pTemp; 008408 assert( b.nCell<nMaxCells ); 008409 b.szCell[b.nCell] = sz; 008410 pTemp = &aSpace1[iSpace1]; 008411 iSpace1 += sz; 008412 assert( sz<=pBt->maxLocal+23 ); 008413 assert( iSpace1 <= (int)pBt->pageSize ); 008414 memcpy(pTemp, apDiv[i], sz); 008415 b.apCell[b.nCell] = pTemp+leafCorrection; 008416 assert( leafCorrection==0 || leafCorrection==4 ); 008417 b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection; 008418 if( !pOld->leaf ){ 008419 assert( leafCorrection==0 ); 008420 assert( pOld->hdrOffset==0 || CORRUPT_DB ); 008421 /* The right pointer of the child page pOld becomes the left 008422 ** pointer of the divider cell */ 008423 memcpy(b.apCell[b.nCell], &pOld->aData[8], 4); 008424 }else{ 008425 assert( 
leafCorrection==4 ); 008426 while( b.szCell[b.nCell]<4 ){ 008427 /* Do not allow any cells smaller than 4 bytes. If a smaller cell 008428 ** does exist, pad it with 0x00 bytes. */ 008429 assert( b.szCell[b.nCell]==3 || CORRUPT_DB ); 008430 assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB ); 008431 aSpace1[iSpace1++] = 0x00; 008432 b.szCell[b.nCell]++; 008433 } 008434 } 008435 b.nCell++; 008436 } 008437 } 008438 008439 /* 008440 ** Figure out the number of pages needed to hold all b.nCell cells. 008441 ** Store this number in "k". Also compute szNew[] which is the total 008442 ** size of all cells on the i-th page and cntNew[] which is the index 008443 ** in b.apCell[] of the cell that divides page i from page i+1. 008444 ** cntNew[k] should equal b.nCell. 008445 ** 008446 ** Values computed by this block: 008447 ** 008448 ** k: The total number of sibling pages 008449 ** szNew[i]: Spaced used on the i-th sibling page. 008450 ** cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to 008451 ** the right of the i-th sibling page. 008452 ** usableSpace: Number of bytes of space available on each sibling. 
008453 ** 008454 */ 008455 usableSpace = pBt->usableSize - 12 + leafCorrection; 008456 for(i=k=0; i<nOld; i++, k++){ 008457 MemPage *p = apOld[i]; 008458 b.apEnd[k] = p->aDataEnd; 008459 b.ixNx[k] = cntOld[i]; 008460 if( k && b.ixNx[k]==b.ixNx[k-1] ){ 008461 k--; /* Omit b.ixNx[] entry for child pages with no cells */ 008462 } 008463 if( !leafData ){ 008464 k++; 008465 b.apEnd[k] = pParent->aDataEnd; 008466 b.ixNx[k] = cntOld[i]+1; 008467 } 008468 assert( p->nFree>=0 ); 008469 szNew[i] = usableSpace - p->nFree; 008470 for(j=0; j<p->nOverflow; j++){ 008471 szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]); 008472 } 008473 cntNew[i] = cntOld[i]; 008474 } 008475 k = nOld; 008476 for(i=0; i<k; i++){ 008477 int sz; 008478 while( szNew[i]>usableSpace ){ 008479 if( i+1>=k ){ 008480 k = i+2; 008481 if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } 008482 szNew[k-1] = 0; 008483 cntNew[k-1] = b.nCell; 008484 } 008485 sz = 2 + cachedCellSize(&b, cntNew[i]-1); 008486 szNew[i] -= sz; 008487 if( !leafData ){ 008488 if( cntNew[i]<b.nCell ){ 008489 sz = 2 + cachedCellSize(&b, cntNew[i]); 008490 }else{ 008491 sz = 0; 008492 } 008493 } 008494 szNew[i+1] += sz; 008495 cntNew[i]--; 008496 } 008497 while( cntNew[i]<b.nCell ){ 008498 sz = 2 + cachedCellSize(&b, cntNew[i]); 008499 if( szNew[i]+sz>usableSpace ) break; 008500 szNew[i] += sz; 008501 cntNew[i]++; 008502 if( !leafData ){ 008503 if( cntNew[i]<b.nCell ){ 008504 sz = 2 + cachedCellSize(&b, cntNew[i]); 008505 }else{ 008506 sz = 0; 008507 } 008508 } 008509 szNew[i+1] -= sz; 008510 } 008511 if( cntNew[i]>=b.nCell ){ 008512 k = i+1; 008513 }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){ 008514 rc = SQLITE_CORRUPT_BKPT; 008515 goto balance_cleanup; 008516 } 008517 } 008518 008519 /* 008520 ** The packing computed by the previous block is biased toward the siblings 008521 ** on the left side (siblings with smaller keys). The left siblings are 008522 ** always nearly full, while the right-most sibling might be nearly empty. 
008523 ** The next block of code attempts to adjust the packing of siblings to 008524 ** get a better balance. 008525 ** 008526 ** This adjustment is more than an optimization. The packing above might 008527 ** be so out of balance as to be illegal. For example, the right-most 008528 ** sibling might be completely empty. This adjustment is not optional. 008529 */ 008530 for(i=k-1; i>0; i--){ 008531 int szRight = szNew[i]; /* Size of sibling on the right */ 008532 int szLeft = szNew[i-1]; /* Size of sibling on the left */ 008533 int r; /* Index of right-most cell in left sibling */ 008534 int d; /* Index of first cell to the left of right sibling */ 008535 008536 r = cntNew[i-1] - 1; 008537 d = r + 1 - leafData; 008538 (void)cachedCellSize(&b, d); 008539 do{ 008540 int szR, szD; 008541 assert( d<nMaxCells ); 008542 assert( r<nMaxCells ); 008543 szR = cachedCellSize(&b, r); 008544 szD = b.szCell[d]; 008545 if( szRight!=0 008546 && (bBulk || szRight+szD+2 > szLeft-(szR+(i==k-1?0:2)))){ 008547 break; 008548 } 008549 szRight += szD + 2; 008550 szLeft -= szR + 2; 008551 cntNew[i-1] = r; 008552 r--; 008553 d--; 008554 }while( r>=0 ); 008555 szNew[i] = szRight; 008556 szNew[i-1] = szLeft; 008557 if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){ 008558 rc = SQLITE_CORRUPT_BKPT; 008559 goto balance_cleanup; 008560 } 008561 } 008562 008563 /* Sanity check: For a non-corrupt database file one of the following 008564 ** must be true: 008565 ** (1) We found one or more cells (cntNew[0])>0), or 008566 ** (2) pPage is a virtual root page. A virtual root page is when 008567 ** the real root page is page 1 and we are the only child of 008568 ** that page. 008569 */ 008570 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB); 008571 TRACE(("BALANCE: old: %u(nc=%u) %u(nc=%u) %u(nc=%u)\n", 008572 apOld[0]->pgno, apOld[0]->nCell, 008573 nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0, 008574 nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? 
apOld[2]->nCell : 0 008575 )); 008576 008577 /* 008578 ** Allocate k new pages. Reuse old pages where possible. 008579 */ 008580 pageFlags = apOld[0]->aData[0]; 008581 for(i=0; i<k; i++){ 008582 MemPage *pNew; 008583 if( i<nOld ){ 008584 pNew = apNew[i] = apOld[i]; 008585 apOld[i] = 0; 008586 rc = sqlite3PagerWrite(pNew->pDbPage); 008587 nNew++; 008588 if( sqlite3PagerPageRefcount(pNew->pDbPage)!=1+(i==(iParentIdx-nxDiv)) 008589 && rc==SQLITE_OK 008590 ){ 008591 rc = SQLITE_CORRUPT_BKPT; 008592 } 008593 if( rc ) goto balance_cleanup; 008594 }else{ 008595 assert( i>0 ); 008596 rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0); 008597 if( rc ) goto balance_cleanup; 008598 zeroPage(pNew, pageFlags); 008599 apNew[i] = pNew; 008600 nNew++; 008601 cntOld[i] = b.nCell; 008602 008603 /* Set the pointer-map entry for the new sibling page. */ 008604 if( ISAUTOVACUUM(pBt) ){ 008605 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc); 008606 if( rc!=SQLITE_OK ){ 008607 goto balance_cleanup; 008608 } 008609 } 008610 } 008611 } 008612 008613 /* 008614 ** Reassign page numbers so that the new pages are in ascending order. 008615 ** This helps to keep entries in the disk file in order so that a scan 008616 ** of the table is closer to a linear scan through the file. That in turn 008617 ** helps the operating system to deliver pages from the disk more rapidly. 008618 ** 008619 ** An O(N*N) sort algorithm is used, but since N is never more than NB+2 008620 ** (5), that is not a performance concern. 008621 ** 008622 ** When NB==3, this one optimization makes the database about 25% faster 008623 ** for large insertions and deletions. 
008624 */ 008625 for(i=0; i<nNew; i++){ 008626 aPgno[i] = apNew[i]->pgno; 008627 assert( apNew[i]->pDbPage->flags & PGHDR_WRITEABLE ); 008628 assert( apNew[i]->pDbPage->flags & PGHDR_DIRTY ); 008629 } 008630 for(i=0; i<nNew-1; i++){ 008631 int iB = i; 008632 for(j=i+1; j<nNew; j++){ 008633 if( apNew[j]->pgno < apNew[iB]->pgno ) iB = j; 008634 } 008635 008636 /* If apNew[i] has a page number that is bigger than any of the 008637 ** subsequence apNew[i] entries, then swap apNew[i] with the subsequent 008638 ** entry that has the smallest page number (which we know to be 008639 ** entry apNew[iB]). 008640 */ 008641 if( iB!=i ){ 008642 Pgno pgnoA = apNew[i]->pgno; 008643 Pgno pgnoB = apNew[iB]->pgno; 008644 Pgno pgnoTemp = (PENDING_BYTE/pBt->pageSize)+1; 008645 u16 fgA = apNew[i]->pDbPage->flags; 008646 u16 fgB = apNew[iB]->pDbPage->flags; 008647 sqlite3PagerRekey(apNew[i]->pDbPage, pgnoTemp, fgB); 008648 sqlite3PagerRekey(apNew[iB]->pDbPage, pgnoA, fgA); 008649 sqlite3PagerRekey(apNew[i]->pDbPage, pgnoB, fgB); 008650 apNew[i]->pgno = pgnoB; 008651 apNew[iB]->pgno = pgnoA; 008652 } 008653 } 008654 008655 TRACE(("BALANCE: new: %u(%u nc=%u) %u(%u nc=%u) %u(%u nc=%u) " 008656 "%u(%u nc=%u) %u(%u nc=%u)\n", 008657 apNew[0]->pgno, szNew[0], cntNew[0], 008658 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, 008659 nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0, 008660 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, 008661 nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0, 008662 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, 008663 nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0, 008664 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0, 008665 nNew>=5 ? 
cntNew[4] - cntNew[3] - !leafData : 0 008666 )); 008667 008668 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 008669 assert( nNew>=1 && nNew<=ArraySize(apNew) ); 008670 assert( apNew[nNew-1]!=0 ); 008671 put4byte(pRight, apNew[nNew-1]->pgno); 008672 008673 /* If the sibling pages are not leaves, ensure that the right-child pointer 008674 ** of the right-most new sibling page is set to the value that was 008675 ** originally in the same field of the right-most old sibling page. */ 008676 if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){ 008677 MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1]; 008678 memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4); 008679 } 008680 008681 /* Make any required updates to pointer map entries associated with 008682 ** cells stored on sibling pages following the balance operation. Pointer 008683 ** map entries associated with divider cells are set by the insertCell() 008684 ** routine. The associated pointer map entries are: 008685 ** 008686 ** a) if the cell contains a reference to an overflow chain, the 008687 ** entry associated with the first page in the overflow chain, and 008688 ** 008689 ** b) if the sibling pages are not leaves, the child page associated 008690 ** with the cell. 008691 ** 008692 ** If the sibling pages are not leaves, then the pointer map entry 008693 ** associated with the right-child of each sibling may also need to be 008694 ** updated. This happens below, after the sibling pages have been 008695 ** populated, not here. 008696 */ 008697 if( ISAUTOVACUUM(pBt) ){ 008698 MemPage *pOld; 008699 MemPage *pNew = pOld = apNew[0]; 008700 int cntOldNext = pNew->nCell + pNew->nOverflow; 008701 int iNew = 0; 008702 int iOld = 0; 008703 008704 for(i=0; i<b.nCell; i++){ 008705 u8 *pCell = b.apCell[i]; 008706 while( i==cntOldNext ){ 008707 iOld++; 008708 assert( iOld<nNew || iOld<nOld ); 008709 assert( iOld>=0 && iOld<NB ); 008710 pOld = iOld<nNew ? 
apNew[iOld] : apOld[iOld]; 008711 cntOldNext += pOld->nCell + pOld->nOverflow + !leafData; 008712 } 008713 if( i==cntNew[iNew] ){ 008714 pNew = apNew[++iNew]; 008715 if( !leafData ) continue; 008716 } 008717 008718 /* Cell pCell is destined for new sibling page pNew. Originally, it 008719 ** was either part of sibling page iOld (possibly an overflow cell), 008720 ** or else the divider cell to the left of sibling page iOld. So, 008721 ** if sibling page iOld had the same page number as pNew, and if 008722 ** pCell really was a part of sibling page iOld (not a divider or 008723 ** overflow cell), we can skip updating the pointer map entries. */ 008724 if( iOld>=nNew 008725 || pNew->pgno!=aPgno[iOld] 008726 || !SQLITE_WITHIN(pCell,pOld->aData,pOld->aDataEnd) 008727 ){ 008728 if( !leafCorrection ){ 008729 ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc); 008730 } 008731 if( cachedCellSize(&b,i)>pNew->minLocal ){ 008732 ptrmapPutOvflPtr(pNew, pOld, pCell, &rc); 008733 } 008734 if( rc ) goto balance_cleanup; 008735 } 008736 } 008737 } 008738 008739 /* Insert new divider cells into pParent. */ 008740 for(i=0; i<nNew-1; i++){ 008741 u8 *pCell; 008742 u8 *pTemp; 008743 int sz; 008744 u8 *pSrcEnd; 008745 MemPage *pNew = apNew[i]; 008746 j = cntNew[i]; 008747 008748 assert( j<nMaxCells ); 008749 assert( b.apCell[j]!=0 ); 008750 pCell = b.apCell[j]; 008751 sz = b.szCell[j] + leafCorrection; 008752 pTemp = &aOvflSpace[iOvflSpace]; 008753 if( !pNew->leaf ){ 008754 memcpy(&pNew->aData[8], pCell, 4); 008755 }else if( leafData ){ 008756 /* If the tree is a leaf-data tree, and the siblings are leaves, 008757 ** then there is no divider cell in b.apCell[]. Instead, the divider 008758 ** cell consists of the integer key for the right-most cell of 008759 ** the sibling-page assembled above only. 
008760 */ 008761 CellInfo info; 008762 j--; 008763 pNew->xParseCell(pNew, b.apCell[j], &info); 008764 pCell = pTemp; 008765 sz = 4 + putVarint(&pCell[4], info.nKey); 008766 pTemp = 0; 008767 }else{ 008768 pCell -= 4; 008769 /* Obscure case for non-leaf-data trees: If the cell at pCell was 008770 ** previously stored on a leaf node, and its reported size was 4 008771 ** bytes, then it may actually be smaller than this 008772 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of 008773 ** any cell). But it is important to pass the correct size to 008774 ** insertCell(), so reparse the cell now. 008775 ** 008776 ** This can only happen for b-trees used to evaluate "IN (SELECT ...)" 008777 ** and WITHOUT ROWID tables with exactly one column which is the 008778 ** primary key. 008779 */ 008780 if( b.szCell[j]==4 ){ 008781 assert(leafCorrection==4); 008782 sz = pParent->xCellSize(pParent, pCell); 008783 } 008784 } 008785 iOvflSpace += sz; 008786 assert( sz<=pBt->maxLocal+23 ); 008787 assert( iOvflSpace <= (int)pBt->pageSize ); 008788 assert( b.ixNx[NB*2-1]>j ); 008789 for(k=0; b.ixNx[k]<=j; k++){} 008790 pSrcEnd = b.apEnd[k]; 008791 if( SQLITE_OVERFLOW(pSrcEnd, pCell, pCell+sz) ){ 008792 rc = SQLITE_CORRUPT_BKPT; 008793 goto balance_cleanup; 008794 } 008795 rc = insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno); 008796 if( rc!=SQLITE_OK ) goto balance_cleanup; 008797 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 008798 } 008799 008800 /* Now update the actual sibling pages. The order in which they are updated 008801 ** is important, as this code needs to avoid disrupting any page from which 008802 ** cells may still to be read. In practice, this means: 008803 ** 008804 ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1]) 008805 ** then it is not safe to update page apNew[iPg] until after 008806 ** the left-hand sibling apNew[iPg-1] has been updated. 
008807 ** 008808 ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1]) 008809 ** then it is not safe to update page apNew[iPg] until after 008810 ** the right-hand sibling apNew[iPg+1] has been updated. 008811 ** 008812 ** If neither of the above apply, the page is safe to update. 008813 ** 008814 ** The iPg value in the following loop starts at nNew-1 goes down 008815 ** to 0, then back up to nNew-1 again, thus making two passes over 008816 ** the pages. On the initial downward pass, only condition (1) above 008817 ** needs to be tested because (2) will always be true from the previous 008818 ** step. On the upward pass, both conditions are always true, so the 008819 ** upwards pass simply processes pages that were missed on the downward 008820 ** pass. 008821 */ 008822 for(i=1-nNew; i<nNew; i++){ 008823 int iPg = i<0 ? -i : i; 008824 assert( iPg>=0 && iPg<nNew ); 008825 assert( iPg>=1 || i>=0 ); 008826 assert( iPg<ArraySize(cntOld) ); 008827 if( abDone[iPg] ) continue; /* Skip pages already processed */ 008828 if( i>=0 /* On the upwards pass, or... */ 008829 || cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */ 008830 ){ 008831 int iNew; 008832 int iOld; 008833 int nNewCell; 008834 008835 /* Verify condition (1): If cells are moving left, update iPg 008836 ** only after iPg-1 has already been updated. */ 008837 assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] ); 008838 008839 /* Verify condition (2): If cells are moving right, update iPg 008840 ** only after iPg+1 has already been updated. */ 008841 assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] ); 008842 008843 if( iPg==0 ){ 008844 iNew = iOld = 0; 008845 nNewCell = cntNew[0]; 008846 }else{ 008847 iOld = iPg<nOld ? 
(cntOld[iPg-1] + !leafData) : b.nCell; 008848 iNew = cntNew[iPg-1] + !leafData; 008849 nNewCell = cntNew[iPg] - iNew; 008850 } 008851 008852 rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b); 008853 if( rc ) goto balance_cleanup; 008854 abDone[iPg]++; 008855 apNew[iPg]->nFree = usableSpace-szNew[iPg]; 008856 assert( apNew[iPg]->nOverflow==0 ); 008857 assert( apNew[iPg]->nCell==nNewCell ); 008858 } 008859 } 008860 008861 /* All pages have been processed exactly once */ 008862 assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 ); 008863 008864 assert( nOld>0 ); 008865 assert( nNew>0 ); 008866 008867 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){ 008868 /* The root page of the b-tree now contains no cells. The only sibling 008869 ** page is the right-child of the parent. Copy the contents of the 008870 ** child page into the parent, decreasing the overall height of the 008871 ** b-tree structure by one. This is described as the "balance-shallower" 008872 ** sub-algorithm in some documentation. 008873 ** 008874 ** If this is an auto-vacuum database, the call to copyNodeContent() 008875 ** sets all pointer-map entries corresponding to database image pages 008876 ** for which the pointer is stored within the content being copied. 008877 ** 008878 ** It is critical that the child page be defragmented before being 008879 ** copied into the parent, because if the parent is page 1 then it will 008880 ** by smaller than the child due to the database header, and so all the 008881 ** free space needs to be up front. 
008882 */ 008883 assert( nNew==1 || CORRUPT_DB ); 008884 rc = defragmentPage(apNew[0], -1); 008885 testcase( rc!=SQLITE_OK ); 008886 assert( apNew[0]->nFree == 008887 (get2byteNotZero(&apNew[0]->aData[5]) - apNew[0]->cellOffset 008888 - apNew[0]->nCell*2) 008889 || rc!=SQLITE_OK 008890 ); 008891 copyNodeContent(apNew[0], pParent, &rc); 008892 freePage(apNew[0], &rc); 008893 }else if( ISAUTOVACUUM(pBt) && !leafCorrection ){ 008894 /* Fix the pointer map entries associated with the right-child of each 008895 ** sibling page. All other pointer map entries have already been taken 008896 ** care of. */ 008897 for(i=0; i<nNew; i++){ 008898 u32 key = get4byte(&apNew[i]->aData[8]); 008899 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); 008900 } 008901 } 008902 008903 assert( pParent->isInit ); 008904 TRACE(("BALANCE: finished: old=%u new=%u cells=%u\n", 008905 nOld, nNew, b.nCell)); 008906 008907 /* Free any old pages that were not reused as new pages. 008908 */ 008909 for(i=nNew; i<nOld; i++){ 008910 freePage(apOld[i], &rc); 008911 } 008912 008913 #if 0 008914 if( ISAUTOVACUUM(pBt) && rc==SQLITE_OK && apNew[0]->isInit ){ 008915 /* The ptrmapCheckPages() contains assert() statements that verify that 008916 ** all pointer map pages are set correctly. This is helpful while 008917 ** debugging. This is usually disabled because a corrupt database may 008918 ** cause an assert() statement to fail. */ 008919 ptrmapCheckPages(apNew, nNew); 008920 ptrmapCheckPages(&pParent, 1); 008921 } 008922 #endif 008923 008924 /* 008925 ** Cleanup before returning. 008926 */ 008927 balance_cleanup: 008928 sqlite3StackFree(0, b.apCell); 008929 for(i=0; i<nOld; i++){ 008930 releasePage(apOld[i]); 008931 } 008932 for(i=0; i<nNew; i++){ 008933 releasePage(apNew[i]); 008934 } 008935 008936 return rc; 008937 } 008938 008939 008940 /* 008941 ** This function is called when the root page of a b-tree structure is 008942 ** overfull (has one or more overflow pages). 
008943 ** 008944 ** A new child page is allocated and the contents of the current root 008945 ** page, including overflow cells, are copied into the child. The root 008946 ** page is then overwritten to make it an empty page with the right-child 008947 ** pointer pointing to the new page. 008948 ** 008949 ** Before returning, all pointer-map entries corresponding to pages 008950 ** that the new child-page now contains pointers to are updated. The 008951 ** entry corresponding to the new right-child pointer of the root 008952 ** page is also updated. 008953 ** 008954 ** If successful, *ppChild is set to contain a reference to the child 008955 ** page and SQLITE_OK is returned. In this case the caller is required 008956 ** to call releasePage() on *ppChild exactly once. If an error occurs, 008957 ** an error code is returned and *ppChild is set to 0. 008958 */ 008959 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ 008960 int rc; /* Return value from subprocedures */ 008961 MemPage *pChild = 0; /* Pointer to a new child page */ 008962 Pgno pgnoChild = 0; /* Page number of the new child page */ 008963 BtShared *pBt = pRoot->pBt; /* The BTree */ 008964 008965 assert( pRoot->nOverflow>0 ); 008966 assert( sqlite3_mutex_held(pBt->mutex) ); 008967 008968 /* Make pRoot, the root page of the b-tree, writable. Allocate a new 008969 ** page that will become the new right-child of pPage. Copy the contents 008970 ** of the node stored on pRoot into the new child page. 
008971 */ 008972 rc = sqlite3PagerWrite(pRoot->pDbPage); 008973 if( rc==SQLITE_OK ){ 008974 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0); 008975 copyNodeContent(pRoot, pChild, &rc); 008976 if( ISAUTOVACUUM(pBt) ){ 008977 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc); 008978 } 008979 } 008980 if( rc ){ 008981 *ppChild = 0; 008982 releasePage(pChild); 008983 return rc; 008984 } 008985 assert( sqlite3PagerIswriteable(pChild->pDbPage) ); 008986 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 008987 assert( pChild->nCell==pRoot->nCell || CORRUPT_DB ); 008988 008989 TRACE(("BALANCE: copy root %u into %u\n", pRoot->pgno, pChild->pgno)); 008990 008991 /* Copy the overflow cells from pRoot to pChild */ 008992 memcpy(pChild->aiOvfl, pRoot->aiOvfl, 008993 pRoot->nOverflow*sizeof(pRoot->aiOvfl[0])); 008994 memcpy(pChild->apOvfl, pRoot->apOvfl, 008995 pRoot->nOverflow*sizeof(pRoot->apOvfl[0])); 008996 pChild->nOverflow = pRoot->nOverflow; 008997 008998 /* Zero the contents of pRoot. Then install pChild as the right-child. */ 008999 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF); 009000 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild); 009001 009002 *ppChild = pChild; 009003 return SQLITE_OK; 009004 } 009005 009006 /* 009007 ** Return SQLITE_CORRUPT if any cursor other than pCur is currently valid 009008 ** on the same B-tree as pCur. 009009 ** 009010 ** This can occur if a database is corrupt with two or more SQL tables 009011 ** pointing to the same b-tree. If an insert occurs on one SQL table 009012 ** and causes a BEFORE TRIGGER to do a secondary insert on the other SQL 009013 ** table linked to the same b-tree. If the secondary insert causes a 009014 ** rebalance, that can change content out from under the cursor on the 009015 ** first SQL table, violating invariants on the first insert. 
009016 */ 009017 static int anotherValidCursor(BtCursor *pCur){ 009018 BtCursor *pOther; 009019 for(pOther=pCur->pBt->pCursor; pOther; pOther=pOther->pNext){ 009020 if( pOther!=pCur 009021 && pOther->eState==CURSOR_VALID 009022 && pOther->pPage==pCur->pPage 009023 ){ 009024 return SQLITE_CORRUPT_PAGE(pCur->pPage); 009025 } 009026 } 009027 return SQLITE_OK; 009028 } 009029 009030 /* 009031 ** The page that pCur currently points to has just been modified in 009032 ** some way. This function figures out if this modification means the 009033 ** tree needs to be balanced, and if so calls the appropriate balancing 009034 ** routine. Balancing routines are: 009035 ** 009036 ** balance_quick() 009037 ** balance_deeper() 009038 ** balance_nonroot() 009039 */ 009040 static int balance(BtCursor *pCur){ 009041 int rc = SQLITE_OK; 009042 u8 aBalanceQuickSpace[13]; 009043 u8 *pFree = 0; 009044 009045 VVA_ONLY( int balance_quick_called = 0 ); 009046 VVA_ONLY( int balance_deeper_called = 0 ); 009047 009048 do { 009049 int iPage; 009050 MemPage *pPage = pCur->pPage; 009051 009052 if( NEVER(pPage->nFree<0) && btreeComputeFreeSpace(pPage) ) break; 009053 if( pPage->nOverflow==0 && pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){ 009054 /* No rebalance required as long as: 009055 ** (1) There are no overflow cells 009056 ** (2) The amount of free space on the page is less than 2/3rds of 009057 ** the total usable space on the page. */ 009058 break; 009059 }else if( (iPage = pCur->iPage)==0 ){ 009060 if( pPage->nOverflow && (rc = anotherValidCursor(pCur))==SQLITE_OK ){ 009061 /* The root page of the b-tree is overfull. In this case call the 009062 ** balance_deeper() function to create a new child for the root-page 009063 ** and copy the current contents of the root-page to it. The 009064 ** next iteration of the do-loop will balance the child page. 
009065 */ 009066 assert( balance_deeper_called==0 ); 009067 VVA_ONLY( balance_deeper_called++ ); 009068 rc = balance_deeper(pPage, &pCur->apPage[1]); 009069 if( rc==SQLITE_OK ){ 009070 pCur->iPage = 1; 009071 pCur->ix = 0; 009072 pCur->aiIdx[0] = 0; 009073 pCur->apPage[0] = pPage; 009074 pCur->pPage = pCur->apPage[1]; 009075 assert( pCur->pPage->nOverflow ); 009076 } 009077 }else{ 009078 break; 009079 } 009080 }else if( sqlite3PagerPageRefcount(pPage->pDbPage)>1 ){ 009081 /* The page being written is not a root page, and there is currently 009082 ** more than one reference to it. This only happens if the page is one 009083 ** of its own ancestor pages. Corruption. */ 009084 rc = SQLITE_CORRUPT_PAGE(pPage); 009085 }else{ 009086 MemPage * const pParent = pCur->apPage[iPage-1]; 009087 int const iIdx = pCur->aiIdx[iPage-1]; 009088 009089 rc = sqlite3PagerWrite(pParent->pDbPage); 009090 if( rc==SQLITE_OK && pParent->nFree<0 ){ 009091 rc = btreeComputeFreeSpace(pParent); 009092 } 009093 if( rc==SQLITE_OK ){ 009094 #ifndef SQLITE_OMIT_QUICKBALANCE 009095 if( pPage->intKeyLeaf 009096 && pPage->nOverflow==1 009097 && pPage->aiOvfl[0]==pPage->nCell 009098 && pParent->pgno!=1 009099 && pParent->nCell==iIdx 009100 ){ 009101 /* Call balance_quick() to create a new sibling of pPage on which 009102 ** to store the overflow cell. balance_quick() inserts a new cell 009103 ** into pParent, which may cause pParent overflow. If this 009104 ** happens, the next iteration of the do-loop will balance pParent 009105 ** use either balance_nonroot() or balance_deeper(). Until this 009106 ** happens, the overflow cell is stored in the aBalanceQuickSpace[] 009107 ** buffer. 009108 ** 009109 ** The purpose of the following assert() is to check that only a 009110 ** single call to balance_quick() is made for each call to this 009111 ** function. If this were not verified, a subtle bug involving reuse 009112 ** of the aBalanceQuickSpace[] might sneak in. 
009113 */ 009114 assert( balance_quick_called==0 ); 009115 VVA_ONLY( balance_quick_called++ ); 009116 rc = balance_quick(pParent, pPage, aBalanceQuickSpace); 009117 }else 009118 #endif 009119 { 009120 /* In this case, call balance_nonroot() to redistribute cells 009121 ** between pPage and up to 2 of its sibling pages. This involves 009122 ** modifying the contents of pParent, which may cause pParent to 009123 ** become overfull or underfull. The next iteration of the do-loop 009124 ** will balance the parent page to correct this. 009125 ** 009126 ** If the parent page becomes overfull, the overflow cell or cells 009127 ** are stored in the pSpace buffer allocated immediately below. 009128 ** A subsequent iteration of the do-loop will deal with this by 009129 ** calling balance_nonroot() (balance_deeper() may be called first, 009130 ** but it doesn't deal with overflow cells - just moves them to a 009131 ** different page). Once this subsequent call to balance_nonroot() 009132 ** has completed, it is safe to release the pSpace buffer used by 009133 ** the previous call, as the overflow cell data will have been 009134 ** copied either into the body of a database page or into the new 009135 ** pSpace buffer passed to the latter call to balance_nonroot(). 009136 */ 009137 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); 009138 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, 009139 pCur->hints&BTREE_BULKLOAD); 009140 if( pFree ){ 009141 /* If pFree is not NULL, it points to the pSpace buffer used 009142 ** by a previous call to balance_nonroot(). Its contents are 009143 ** now stored either on real database pages or within the 009144 ** new pSpace buffer, so it may be safely freed here. */ 009145 sqlite3PageFree(pFree); 009146 } 009147 009148 /* The pSpace buffer will be freed after the next call to 009149 ** balance_nonroot(), or just before this function returns, whichever 009150 ** comes first. 
*/ 009151 pFree = pSpace; 009152 } 009153 } 009154 009155 pPage->nOverflow = 0; 009156 009157 /* The next iteration of the do-loop balances the parent page. */ 009158 releasePage(pPage); 009159 pCur->iPage--; 009160 assert( pCur->iPage>=0 ); 009161 pCur->pPage = pCur->apPage[pCur->iPage]; 009162 } 009163 }while( rc==SQLITE_OK ); 009164 009165 if( pFree ){ 009166 sqlite3PageFree(pFree); 009167 } 009168 return rc; 009169 } 009170 009171 /* Overwrite content from pX into pDest. Only do the write if the 009172 ** content is different from what is already there. 009173 */ 009174 static int btreeOverwriteContent( 009175 MemPage *pPage, /* MemPage on which writing will occur */ 009176 u8 *pDest, /* Pointer to the place to start writing */ 009177 const BtreePayload *pX, /* Source of data to write */ 009178 int iOffset, /* Offset of first byte to write */ 009179 int iAmt /* Number of bytes to be written */ 009180 ){ 009181 int nData = pX->nData - iOffset; 009182 if( nData<=0 ){ 009183 /* Overwriting with zeros */ 009184 int i; 009185 for(i=0; i<iAmt && pDest[i]==0; i++){} 009186 if( i<iAmt ){ 009187 int rc = sqlite3PagerWrite(pPage->pDbPage); 009188 if( rc ) return rc; 009189 memset(pDest + i, 0, iAmt - i); 009190 } 009191 }else{ 009192 if( nData<iAmt ){ 009193 /* Mixed read data and zeros at the end. Make a recursive call 009194 ** to write the zeros then fall through to write the real data */ 009195 int rc = btreeOverwriteContent(pPage, pDest+nData, pX, iOffset+nData, 009196 iAmt-nData); 009197 if( rc ) return rc; 009198 iAmt = nData; 009199 } 009200 if( memcmp(pDest, ((u8*)pX->pData) + iOffset, iAmt)!=0 ){ 009201 int rc = sqlite3PagerWrite(pPage->pDbPage); 009202 if( rc ) return rc; 009203 /* In a corrupt database, it is possible for the source and destination 009204 ** buffers to overlap. This is harmless since the database is already 009205 ** corrupt but it does cause valgrind and ASAN warnings. So use 009206 ** memmove(). 
*/ 009207 memmove(pDest, ((u8*)pX->pData) + iOffset, iAmt); 009208 } 009209 } 009210 return SQLITE_OK; 009211 } 009212 009213 /* 009214 ** Overwrite the cell that cursor pCur is pointing to with fresh content 009215 ** contained in pX. In this variant, pCur is pointing to an overflow 009216 ** cell. 009217 */ 009218 static SQLITE_NOINLINE int btreeOverwriteOverflowCell( 009219 BtCursor *pCur, /* Cursor pointing to cell to overwrite */ 009220 const BtreePayload *pX /* Content to write into the cell */ 009221 ){ 009222 int iOffset; /* Next byte of pX->pData to write */ 009223 int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */ 009224 int rc; /* Return code */ 009225 MemPage *pPage = pCur->pPage; /* Page being written */ 009226 BtShared *pBt; /* Btree */ 009227 Pgno ovflPgno; /* Next overflow page to write */ 009228 u32 ovflPageSize; /* Size to write on overflow page */ 009229 009230 assert( pCur->info.nLocal<nTotal ); /* pCur is an overflow cell */ 009231 009232 /* Overwrite the local portion first */ 009233 rc = btreeOverwriteContent(pPage, pCur->info.pPayload, pX, 009234 0, pCur->info.nLocal); 009235 if( rc ) return rc; 009236 009237 /* Now overwrite the overflow pages */ 009238 iOffset = pCur->info.nLocal; 009239 assert( nTotal>=0 ); 009240 assert( iOffset>=0 ); 009241 ovflPgno = get4byte(pCur->info.pPayload + iOffset); 009242 pBt = pPage->pBt; 009243 ovflPageSize = pBt->usableSize - 4; 009244 do{ 009245 rc = btreeGetPage(pBt, ovflPgno, &pPage, 0); 009246 if( rc ) return rc; 009247 if( sqlite3PagerPageRefcount(pPage->pDbPage)!=1 || pPage->isInit ){ 009248 rc = SQLITE_CORRUPT_PAGE(pPage); 009249 }else{ 009250 if( iOffset+ovflPageSize<(u32)nTotal ){ 009251 ovflPgno = get4byte(pPage->aData); 009252 }else{ 009253 ovflPageSize = nTotal - iOffset; 009254 } 009255 rc = btreeOverwriteContent(pPage, pPage->aData+4, pX, 009256 iOffset, ovflPageSize); 009257 } 009258 sqlite3PagerUnref(pPage->pDbPage); 009259 if( rc ) return rc; 009260 iOffset += ovflPageSize; 
009261 }while( iOffset<nTotal ); 009262 return SQLITE_OK; 009263 } 009264 009265 /* 009266 ** Overwrite the cell that cursor pCur is pointing to with fresh content 009267 ** contained in pX. 009268 */ 009269 static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){ 009270 int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */ 009271 MemPage *pPage = pCur->pPage; /* Page being written */ 009272 009273 if( pCur->info.pPayload + pCur->info.nLocal > pPage->aDataEnd 009274 || pCur->info.pPayload < pPage->aData + pPage->cellOffset 009275 ){ 009276 return SQLITE_CORRUPT_PAGE(pPage); 009277 } 009278 if( pCur->info.nLocal==nTotal ){ 009279 /* The entire cell is local */ 009280 return btreeOverwriteContent(pPage, pCur->info.pPayload, pX, 009281 0, pCur->info.nLocal); 009282 }else{ 009283 /* The cell contains overflow content */ 009284 return btreeOverwriteOverflowCell(pCur, pX); 009285 } 009286 } 009287 009288 009289 /* 009290 ** Insert a new record into the BTree. The content of the new record 009291 ** is described by the pX object. The pCur cursor is used only to 009292 ** define what table the record should be inserted into, and is left 009293 ** pointing at a random location. 009294 ** 009295 ** For a table btree (used for rowid tables), only the pX.nKey value of 009296 ** the key is used. The pX.pKey value must be NULL. The pX.nKey is the 009297 ** rowid or INTEGER PRIMARY KEY of the row. The pX.nData,pData,nZero fields 009298 ** hold the content of the row. 009299 ** 009300 ** For an index btree (used for indexes and WITHOUT ROWID tables), the 009301 ** key is an arbitrary byte sequence stored in pX.pKey,nKey. The 009302 ** pX.pData,nData,nZero fields must be zero. 009303 ** 009304 ** If the seekResult parameter is non-zero, then a successful call to 009305 ** sqlite3BtreeIndexMoveto() to seek cursor pCur to (pKey,nKey) has already 009306 ** been performed. 
In other words, if seekResult!=0 then the cursor 009307 ** is currently pointing to a cell that will be adjacent to the cell 009308 ** to be inserted. If seekResult<0 then pCur points to a cell that is 009309 ** smaller then (pKey,nKey). If seekResult>0 then pCur points to a cell 009310 ** that is larger than (pKey,nKey). 009311 ** 009312 ** If seekResult==0, that means pCur is pointing at some unknown location. 009313 ** In that case, this routine must seek the cursor to the correct insertion 009314 ** point for (pKey,nKey) before doing the insertion. For index btrees, 009315 ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked 009316 ** key values and pX->aMem can be used instead of pX->pKey to avoid having 009317 ** to decode the key. 009318 */ 009319 int sqlite3BtreeInsert( 009320 BtCursor *pCur, /* Insert data into the table of this cursor */ 009321 const BtreePayload *pX, /* Content of the row to be inserted */ 009322 int flags, /* True if this is likely an append */ 009323 int seekResult /* Result of prior IndexMoveto() call */ 009324 ){ 009325 int rc; 009326 int loc = seekResult; /* -1: before desired location +1: after */ 009327 int szNew = 0; 009328 int idx; 009329 MemPage *pPage; 009330 Btree *p = pCur->pBtree; 009331 unsigned char *oldCell; 009332 unsigned char *newCell = 0; 009333 009334 assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND|BTREE_PREFORMAT))==flags ); 009335 assert( (flags & BTREE_PREFORMAT)==0 || seekResult || pCur->pKeyInfo==0 ); 009336 009337 /* Save the positions of any other cursors open on this table. 009338 ** 009339 ** In some cases, the call to btreeMoveto() below is a no-op. For 009340 ** example, when inserting data into a table with auto-generated integer 009341 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the 009342 ** integer key to use. It then calls this function to actually insert the 009343 ** data into the intkey B-Tree. 
In this case btreeMoveto() recognizes 009344 ** that the cursor is already where it needs to be and returns without 009345 ** doing any work. To avoid thwarting these optimizations, it is important 009346 ** not to clear the cursor here. 009347 */ 009348 if( pCur->curFlags & BTCF_Multiple ){ 009349 rc = saveAllCursors(p->pBt, pCur->pgnoRoot, pCur); 009350 if( rc ) return rc; 009351 if( loc && pCur->iPage<0 ){ 009352 /* This can only happen if the schema is corrupt such that there is more 009353 ** than one table or index with the same root page as used by the cursor. 009354 ** Which can only happen if the SQLITE_NoSchemaError flag was set when 009355 ** the schema was loaded. This cannot be asserted though, as a user might 009356 ** set the flag, load the schema, and then unset the flag. */ 009357 return SQLITE_CORRUPT_PGNO(pCur->pgnoRoot); 009358 } 009359 } 009360 009361 /* Ensure that the cursor is not in the CURSOR_FAULT state and that it 009362 ** points to a valid cell. 009363 */ 009364 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 009365 testcase( pCur->eState==CURSOR_REQUIRESEEK ); 009366 testcase( pCur->eState==CURSOR_FAULT ); 009367 rc = moveToRoot(pCur); 009368 if( rc && rc!=SQLITE_EMPTY ) return rc; 009369 } 009370 009371 assert( cursorOwnsBtShared(pCur) ); 009372 assert( (pCur->curFlags & BTCF_WriteFlag)!=0 009373 && p->pBt->inTransaction==TRANS_WRITE 009374 && (p->pBt->btsFlags & BTS_READ_ONLY)==0 ); 009375 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 009376 009377 /* Assert that the caller has been consistent. If this cursor was opened 009378 ** expecting an index b-tree, then the caller should be inserting blob 009379 ** keys with no associated data. If the cursor was opened expecting an 009380 ** intkey table, the caller should be inserting integer keys with a 009381 ** blob of associated data. 
*/ 009382 assert( (flags & BTREE_PREFORMAT) || (pX->pKey==0)==(pCur->pKeyInfo==0) ); 009383 009384 if( pCur->pKeyInfo==0 ){ 009385 assert( pX->pKey==0 ); 009386 /* If this is an insert into a table b-tree, invalidate any incrblob 009387 ** cursors open on the row being replaced */ 009388 if( p->hasIncrblobCur ){ 009389 invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0); 009390 } 009391 009392 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing 009393 ** to a row with the same key as the new entry being inserted. 009394 */ 009395 #ifdef SQLITE_DEBUG 009396 if( flags & BTREE_SAVEPOSITION ){ 009397 assert( pCur->curFlags & BTCF_ValidNKey ); 009398 assert( pX->nKey==pCur->info.nKey ); 009399 assert( loc==0 ); 009400 } 009401 #endif 009402 009403 /* On the other hand, BTREE_SAVEPOSITION==0 does not imply 009404 ** that the cursor is not pointing to a row to be overwritten. 009405 ** So do a complete check. 009406 */ 009407 if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){ 009408 /* The cursor is pointing to the entry that is to be 009409 ** overwritten */ 009410 assert( pX->nData>=0 && pX->nZero>=0 ); 009411 if( pCur->info.nSize!=0 009412 && pCur->info.nPayload==(u32)pX->nData+pX->nZero 009413 ){ 009414 /* New entry is the same size as the old. Do an overwrite */ 009415 return btreeOverwriteCell(pCur, pX); 009416 } 009417 assert( loc==0 ); 009418 }else if( loc==0 ){ 009419 /* The cursor is *not* pointing to the cell to be overwritten, nor 009420 ** to an adjacent cell. Move the cursor so that it is pointing either 009421 ** to the cell to be overwritten or an adjacent cell. 
009422 */ 009423 rc = sqlite3BtreeTableMoveto(pCur, pX->nKey, 009424 (flags & BTREE_APPEND)!=0, &loc); 009425 if( rc ) return rc; 009426 } 009427 }else{ 009428 /* This is an index or a WITHOUT ROWID table */ 009429 009430 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing 009431 ** to a row with the same key as the new entry being inserted. 009432 */ 009433 assert( (flags & BTREE_SAVEPOSITION)==0 || loc==0 ); 009434 009435 /* If the cursor is not already pointing either to the cell to be 009436 ** overwritten, or if a new cell is being inserted, if the cursor is 009437 ** not pointing to an immediately adjacent cell, then move the cursor 009438 ** so that it does. 009439 */ 009440 if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){ 009441 if( pX->nMem ){ 009442 UnpackedRecord r; 009443 r.pKeyInfo = pCur->pKeyInfo; 009444 r.aMem = pX->aMem; 009445 r.nField = pX->nMem; 009446 r.default_rc = 0; 009447 r.eqSeen = 0; 009448 rc = sqlite3BtreeIndexMoveto(pCur, &r, &loc); 009449 }else{ 009450 rc = btreeMoveto(pCur, pX->pKey, pX->nKey, 009451 (flags & BTREE_APPEND)!=0, &loc); 009452 } 009453 if( rc ) return rc; 009454 } 009455 009456 /* If the cursor is currently pointing to an entry to be overwritten 009457 ** and the new content is the same as as the old, then use the 009458 ** overwrite optimization. 
009459 */ 009460 if( loc==0 ){ 009461 getCellInfo(pCur); 009462 if( pCur->info.nKey==pX->nKey ){ 009463 BtreePayload x2; 009464 x2.pData = pX->pKey; 009465 x2.nData = pX->nKey; 009466 x2.nZero = 0; 009467 return btreeOverwriteCell(pCur, &x2); 009468 } 009469 } 009470 } 009471 assert( pCur->eState==CURSOR_VALID 009472 || (pCur->eState==CURSOR_INVALID && loc) || CORRUPT_DB ); 009473 009474 pPage = pCur->pPage; 009475 assert( pPage->intKey || pX->nKey>=0 || (flags & BTREE_PREFORMAT) ); 009476 assert( pPage->leaf || !pPage->intKey ); 009477 if( pPage->nFree<0 ){ 009478 if( NEVER(pCur->eState>CURSOR_INVALID) ){ 009479 /* ^^^^^--- due to the moveToRoot() call above */ 009480 rc = SQLITE_CORRUPT_PAGE(pPage); 009481 }else{ 009482 rc = btreeComputeFreeSpace(pPage); 009483 } 009484 if( rc ) return rc; 009485 } 009486 009487 TRACE(("INSERT: table=%u nkey=%lld ndata=%u page=%u %s\n", 009488 pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno, 009489 loc==0 ? "overwrite" : "new entry")); 009490 assert( pPage->isInit || CORRUPT_DB ); 009491 newCell = p->pBt->pTmpSpace; 009492 assert( newCell!=0 ); 009493 assert( BTREE_PREFORMAT==OPFLAG_PREFORMAT ); 009494 if( flags & BTREE_PREFORMAT ){ 009495 rc = SQLITE_OK; 009496 szNew = p->pBt->nPreformatSize; 009497 if( szNew<4 ){ 009498 szNew = 4; 009499 newCell[3] = 0; 009500 } 009501 if( ISAUTOVACUUM(p->pBt) && szNew>pPage->maxLocal ){ 009502 CellInfo info; 009503 pPage->xParseCell(pPage, newCell, &info); 009504 if( info.nPayload!=info.nLocal ){ 009505 Pgno ovfl = get4byte(&newCell[szNew-4]); 009506 ptrmapPut(p->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, &rc); 009507 if( NEVER(rc) ) goto end_insert; 009508 } 009509 } 009510 }else{ 009511 rc = fillInCell(pPage, newCell, pX, &szNew); 009512 if( rc ) goto end_insert; 009513 } 009514 assert( szNew==pPage->xCellSize(pPage, newCell) ); 009515 assert( szNew <= MX_CELL_SIZE(p->pBt) ); 009516 idx = pCur->ix; 009517 pCur->info.nSize = 0; 009518 if( loc==0 ){ 009519 CellInfo info; 009520 assert( idx>=0 
); 009521 if( idx>=pPage->nCell ){ 009522 return SQLITE_CORRUPT_PAGE(pPage); 009523 } 009524 rc = sqlite3PagerWrite(pPage->pDbPage); 009525 if( rc ){ 009526 goto end_insert; 009527 } 009528 oldCell = findCell(pPage, idx); 009529 if( !pPage->leaf ){ 009530 memcpy(newCell, oldCell, 4); 009531 } 009532 BTREE_CLEAR_CELL(rc, pPage, oldCell, info); 009533 testcase( pCur->curFlags & BTCF_ValidOvfl ); 009534 invalidateOverflowCache(pCur); 009535 if( info.nSize==szNew && info.nLocal==info.nPayload 009536 && (!ISAUTOVACUUM(p->pBt) || szNew<pPage->minLocal) 009537 ){ 009538 /* Overwrite the old cell with the new if they are the same size. 009539 ** We could also try to do this if the old cell is smaller, then add 009540 ** the leftover space to the free list. But experiments show that 009541 ** doing that is no faster then skipping this optimization and just 009542 ** calling dropCell() and insertCell(). 009543 ** 009544 ** This optimization cannot be used on an autovacuum database if the 009545 ** new entry uses overflow pages, as the insertCell() call below is 009546 ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry. 
*/ 009547 assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */ 009548 if( oldCell < pPage->aData+pPage->hdrOffset+10 ){ 009549 return SQLITE_CORRUPT_PAGE(pPage); 009550 } 009551 if( oldCell+szNew > pPage->aDataEnd ){ 009552 return SQLITE_CORRUPT_PAGE(pPage); 009553 } 009554 memcpy(oldCell, newCell, szNew); 009555 return SQLITE_OK; 009556 } 009557 dropCell(pPage, idx, info.nSize, &rc); 009558 if( rc ) goto end_insert; 009559 }else if( loc<0 && pPage->nCell>0 ){ 009560 assert( pPage->leaf ); 009561 idx = ++pCur->ix; 009562 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 009563 }else{ 009564 assert( pPage->leaf ); 009565 } 009566 rc = insertCellFast(pPage, idx, newCell, szNew); 009567 assert( pPage->nOverflow==0 || rc==SQLITE_OK ); 009568 assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 ); 009569 009570 /* If no error has occurred and pPage has an overflow cell, call balance() 009571 ** to redistribute the cells within the tree. Since balance() may move 009572 ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey 009573 ** variables. 009574 ** 009575 ** Previous versions of SQLite called moveToRoot() to move the cursor 009576 ** back to the root page as balance() used to invalidate the contents 009577 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that, 009578 ** set the cursor state to "invalid". This makes common insert operations 009579 ** slightly faster. 009580 ** 009581 ** There is a subtle but important optimization here too. When inserting 009582 ** multiple records into an intkey b-tree using a single cursor (as can 009583 ** happen while processing an "INSERT INTO ... SELECT" statement), it 009584 ** is advantageous to leave the cursor pointing to the last entry in 009585 ** the b-tree if possible. 
If the cursor is left pointing to the last 009586 ** entry in the table, and the next row inserted has an integer key 009587 ** larger than the largest existing key, it is possible to insert the 009588 ** row without seeking the cursor. This can be a big performance boost. 009589 */ 009590 if( pPage->nOverflow ){ 009591 assert( rc==SQLITE_OK ); 009592 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 009593 rc = balance(pCur); 009594 009595 /* Must make sure nOverflow is reset to zero even if the balance() 009596 ** fails. Internal data structure corruption will result otherwise. 009597 ** Also, set the cursor state to invalid. This stops saveCursorPosition() 009598 ** from trying to save the current position of the cursor. */ 009599 pCur->pPage->nOverflow = 0; 009600 pCur->eState = CURSOR_INVALID; 009601 if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){ 009602 btreeReleaseAllCursorPages(pCur); 009603 if( pCur->pKeyInfo ){ 009604 assert( pCur->pKey==0 ); 009605 pCur->pKey = sqlite3Malloc( pX->nKey ); 009606 if( pCur->pKey==0 ){ 009607 rc = SQLITE_NOMEM; 009608 }else{ 009609 memcpy(pCur->pKey, pX->pKey, pX->nKey); 009610 } 009611 } 009612 pCur->eState = CURSOR_REQUIRESEEK; 009613 pCur->nKey = pX->nKey; 009614 } 009615 } 009616 assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 ); 009617 009618 end_insert: 009619 return rc; 009620 } 009621 009622 /* 009623 ** This function is used as part of copying the current row from cursor 009624 ** pSrc into cursor pDest. If the cursors are open on intkey tables, then 009625 ** parameter iKey is used as the rowid value when the record is copied 009626 ** into pDest. Otherwise, the record is copied verbatim. 009627 ** 009628 ** This function does not actually write the new value to cursor pDest. 009629 ** Instead, it creates and populates any required overflow pages and 009630 ** writes the data for the new cell into the BtShared.pTmpSpace buffer 009631 ** for the destination database. 
The size of the cell, in bytes, is left 009632 ** in BtShared.nPreformatSize. The caller completes the insertion by 009633 ** calling sqlite3BtreeInsert() with the BTREE_PREFORMAT flag specified. 009634 ** 009635 ** SQLITE_OK is returned if successful, or an SQLite error code otherwise. 009636 */ 009637 int sqlite3BtreeTransferRow(BtCursor *pDest, BtCursor *pSrc, i64 iKey){ 009638 BtShared *pBt = pDest->pBt; 009639 u8 *aOut = pBt->pTmpSpace; /* Pointer to next output buffer */ 009640 const u8 *aIn; /* Pointer to next input buffer */ 009641 u32 nIn; /* Size of input buffer aIn[] */ 009642 u32 nRem; /* Bytes of data still to copy */ 009643 009644 getCellInfo(pSrc); 009645 if( pSrc->info.nPayload<0x80 ){ 009646 *(aOut++) = pSrc->info.nPayload; 009647 }else{ 009648 aOut += sqlite3PutVarint(aOut, pSrc->info.nPayload); 009649 } 009650 if( pDest->pKeyInfo==0 ) aOut += putVarint(aOut, iKey); 009651 nIn = pSrc->info.nLocal; 009652 aIn = pSrc->info.pPayload; 009653 if( aIn+nIn>pSrc->pPage->aDataEnd ){ 009654 return SQLITE_CORRUPT_PAGE(pSrc->pPage); 009655 } 009656 nRem = pSrc->info.nPayload; 009657 if( nIn==nRem && nIn<pDest->pPage->maxLocal ){ 009658 memcpy(aOut, aIn, nIn); 009659 pBt->nPreformatSize = nIn + (aOut - pBt->pTmpSpace); 009660 return SQLITE_OK; 009661 }else{ 009662 int rc = SQLITE_OK; 009663 Pager *pSrcPager = pSrc->pBt->pPager; 009664 u8 *pPgnoOut = 0; 009665 Pgno ovflIn = 0; 009666 DbPage *pPageIn = 0; 009667 MemPage *pPageOut = 0; 009668 u32 nOut; /* Size of output buffer aOut[] */ 009669 009670 nOut = btreePayloadToLocal(pDest->pPage, pSrc->info.nPayload); 009671 pBt->nPreformatSize = nOut + (aOut - pBt->pTmpSpace); 009672 if( nOut<pSrc->info.nPayload ){ 009673 pPgnoOut = &aOut[nOut]; 009674 pBt->nPreformatSize += 4; 009675 } 009676 009677 if( nRem>nIn ){ 009678 if( aIn+nIn+4>pSrc->pPage->aDataEnd ){ 009679 return SQLITE_CORRUPT_PAGE(pSrc->pPage); 009680 } 009681 ovflIn = get4byte(&pSrc->info.pPayload[nIn]); 009682 } 009683 009684 do { 009685 nRem -= nOut; 
009686 do{ 009687 assert( nOut>0 ); 009688 if( nIn>0 ){ 009689 int nCopy = MIN(nOut, nIn); 009690 memcpy(aOut, aIn, nCopy); 009691 nOut -= nCopy; 009692 nIn -= nCopy; 009693 aOut += nCopy; 009694 aIn += nCopy; 009695 } 009696 if( nOut>0 ){ 009697 sqlite3PagerUnref(pPageIn); 009698 pPageIn = 0; 009699 rc = sqlite3PagerGet(pSrcPager, ovflIn, &pPageIn, PAGER_GET_READONLY); 009700 if( rc==SQLITE_OK ){ 009701 aIn = (const u8*)sqlite3PagerGetData(pPageIn); 009702 ovflIn = get4byte(aIn); 009703 aIn += 4; 009704 nIn = pSrc->pBt->usableSize - 4; 009705 } 009706 } 009707 }while( rc==SQLITE_OK && nOut>0 ); 009708 009709 if( rc==SQLITE_OK && nRem>0 && ALWAYS(pPgnoOut) ){ 009710 Pgno pgnoNew; 009711 MemPage *pNew = 0; 009712 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 009713 put4byte(pPgnoOut, pgnoNew); 009714 if( ISAUTOVACUUM(pBt) && pPageOut ){ 009715 ptrmapPut(pBt, pgnoNew, PTRMAP_OVERFLOW2, pPageOut->pgno, &rc); 009716 } 009717 releasePage(pPageOut); 009718 pPageOut = pNew; 009719 if( pPageOut ){ 009720 pPgnoOut = pPageOut->aData; 009721 put4byte(pPgnoOut, 0); 009722 aOut = &pPgnoOut[4]; 009723 nOut = MIN(pBt->usableSize - 4, nRem); 009724 } 009725 } 009726 }while( nRem>0 && rc==SQLITE_OK ); 009727 009728 releasePage(pPageOut); 009729 sqlite3PagerUnref(pPageIn); 009730 return rc; 009731 } 009732 } 009733 009734 /* 009735 ** Delete the entry that the cursor is pointing to. 009736 ** 009737 ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then 009738 ** the cursor is left pointing at an arbitrary location after the delete. 009739 ** But if that bit is set, then the cursor is left in a state such that 009740 ** the next call to BtreeNext() or BtreePrev() moves it to the same row 009741 ** as it would have been on if the call to BtreeDelete() had been omitted. 009742 ** 009743 ** The BTREE_AUXDELETE bit of flags indicates that is one of several deletes 009744 ** associated with a single table entry and its indexes. 
Only one of those 009745 ** deletes is considered the "primary" delete. The primary delete occurs 009746 ** on a cursor that is not a BTREE_FORDELETE cursor. All but one delete 009747 ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag. 009748 ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation, 009749 ** but which might be used by alternative storage engines. 009750 */ 009751 int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ 009752 Btree *p = pCur->pBtree; 009753 BtShared *pBt = p->pBt; 009754 int rc; /* Return code */ 009755 MemPage *pPage; /* Page to delete cell from */ 009756 unsigned char *pCell; /* Pointer to cell to delete */ 009757 int iCellIdx; /* Index of cell to delete */ 009758 int iCellDepth; /* Depth of node containing pCell */ 009759 CellInfo info; /* Size of the cell being deleted */ 009760 u8 bPreserve; /* Keep cursor valid. 2 for CURSOR_SKIPNEXT */ 009761 009762 assert( cursorOwnsBtShared(pCur) ); 009763 assert( pBt->inTransaction==TRANS_WRITE ); 009764 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 009765 assert( pCur->curFlags & BTCF_WriteFlag ); 009766 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 009767 assert( !hasReadConflicts(p, pCur->pgnoRoot) ); 009768 assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 ); 009769 if( pCur->eState!=CURSOR_VALID ){ 009770 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 009771 rc = btreeRestoreCursorPosition(pCur); 009772 assert( rc!=SQLITE_OK || CORRUPT_DB || pCur->eState==CURSOR_VALID ); 009773 if( rc || pCur->eState!=CURSOR_VALID ) return rc; 009774 }else{ 009775 return SQLITE_CORRUPT_PGNO(pCur->pgnoRoot); 009776 } 009777 } 009778 assert( pCur->eState==CURSOR_VALID ); 009779 009780 iCellDepth = pCur->iPage; 009781 iCellIdx = pCur->ix; 009782 pPage = pCur->pPage; 009783 if( pPage->nCell<=iCellIdx ){ 009784 return SQLITE_CORRUPT_PAGE(pPage); 009785 } 009786 pCell = findCell(pPage, iCellIdx); 009787 if( pPage->nFree<0 && 
btreeComputeFreeSpace(pPage) ){ 009788 return SQLITE_CORRUPT_PAGE(pPage); 009789 } 009790 if( pCell<&pPage->aCellIdx[pPage->nCell] ){ 009791 return SQLITE_CORRUPT_PAGE(pPage); 009792 } 009793 009794 /* If the BTREE_SAVEPOSITION bit is on, then the cursor position must 009795 ** be preserved following this delete operation. If the current delete 009796 ** will cause a b-tree rebalance, then this is done by saving the cursor 009797 ** key and leaving the cursor in CURSOR_REQUIRESEEK state before 009798 ** returning. 009799 ** 009800 ** If the current delete will not cause a rebalance, then the cursor 009801 ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately 009802 ** before or after the deleted entry. 009803 ** 009804 ** The bPreserve value records which path is required: 009805 ** 009806 ** bPreserve==0 Not necessary to save the cursor position 009807 ** bPreserve==1 Use CURSOR_REQUIRESEEK to save the cursor position 009808 ** bPreserve==2 Cursor won't move. Set CURSOR_SKIPNEXT. 009809 */ 009810 bPreserve = (flags & BTREE_SAVEPOSITION)!=0; 009811 if( bPreserve ){ 009812 if( !pPage->leaf 009813 || (pPage->nFree+pPage->xCellSize(pPage,pCell)+2) > 009814 (int)(pBt->usableSize*2/3) 009815 || pPage->nCell==1 /* See dbfuzz001.test for a test case */ 009816 ){ 009817 /* A b-tree rebalance will be required after deleting this entry. 009818 ** Save the cursor key. */ 009819 rc = saveCursorKey(pCur); 009820 if( rc ) return rc; 009821 }else{ 009822 bPreserve = 2; 009823 } 009824 } 009825 009826 /* If the page containing the entry to delete is not a leaf page, move 009827 ** the cursor to the largest entry in the tree that is smaller than 009828 ** the entry being deleted. This cell will replace the cell being deleted 009829 ** from the internal node. The 'previous' entry is used for this instead 009830 ** of the 'next' entry, as the previous entry is always a part of the 009831 ** sub-tree headed by the child page of the cell being deleted. 
This makes 009832 ** balancing the tree following the delete operation easier. */ 009833 if( !pPage->leaf ){ 009834 rc = sqlite3BtreePrevious(pCur, 0); 009835 assert( rc!=SQLITE_DONE ); 009836 if( rc ) return rc; 009837 } 009838 009839 /* Save the positions of any other cursors open on this table before 009840 ** making any modifications. */ 009841 if( pCur->curFlags & BTCF_Multiple ){ 009842 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); 009843 if( rc ) return rc; 009844 } 009845 009846 /* If this is a delete operation to remove a row from a table b-tree, 009847 ** invalidate any incrblob cursors open on the row being deleted. */ 009848 if( pCur->pKeyInfo==0 && p->hasIncrblobCur ){ 009849 invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0); 009850 } 009851 009852 /* Make the page containing the entry to be deleted writable. Then free any 009853 ** overflow pages associated with the entry and finally remove the cell 009854 ** itself from within the page. */ 009855 rc = sqlite3PagerWrite(pPage->pDbPage); 009856 if( rc ) return rc; 009857 BTREE_CLEAR_CELL(rc, pPage, pCell, info); 009858 dropCell(pPage, iCellIdx, info.nSize, &rc); 009859 if( rc ) return rc; 009860 009861 /* If the cell deleted was not located on a leaf page, then the cursor 009862 ** is currently pointing to the largest entry in the sub-tree headed 009863 ** by the child-page of the cell that was just deleted from an internal 009864 ** node. The cell from the leaf node needs to be moved to the internal 009865 ** node to replace the deleted cell. 
*/ 009866 if( !pPage->leaf ){ 009867 MemPage *pLeaf = pCur->pPage; 009868 int nCell; 009869 Pgno n; 009870 unsigned char *pTmp; 009871 009872 if( pLeaf->nFree<0 ){ 009873 rc = btreeComputeFreeSpace(pLeaf); 009874 if( rc ) return rc; 009875 } 009876 if( iCellDepth<pCur->iPage-1 ){ 009877 n = pCur->apPage[iCellDepth+1]->pgno; 009878 }else{ 009879 n = pCur->pPage->pgno; 009880 } 009881 pCell = findCell(pLeaf, pLeaf->nCell-1); 009882 if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_PAGE(pLeaf); 009883 nCell = pLeaf->xCellSize(pLeaf, pCell); 009884 assert( MX_CELL_SIZE(pBt) >= nCell ); 009885 pTmp = pBt->pTmpSpace; 009886 assert( pTmp!=0 ); 009887 rc = sqlite3PagerWrite(pLeaf->pDbPage); 009888 if( rc==SQLITE_OK ){ 009889 rc = insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n); 009890 } 009891 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc); 009892 if( rc ) return rc; 009893 } 009894 009895 /* Balance the tree. If the entry deleted was located on a leaf page, 009896 ** then the cursor still points to that page. In this case the first 009897 ** call to balance() repairs the tree, and the if(...) condition is 009898 ** never true. 009899 ** 009900 ** Otherwise, if the entry deleted was on an internal node page, then 009901 ** pCur is pointing to the leaf page from which a cell was removed to 009902 ** replace the cell deleted from the internal node. This is slightly 009903 ** tricky as the leaf node may be underfull, and the internal node may 009904 ** be either under or overfull. In this case run the balancing algorithm 009905 ** on the leaf node first. If the balance proceeds far enough up the 009906 ** tree that we can be sure that any problem in the internal node has 009907 ** been corrected, so be it. Otherwise, after balancing the leaf node, 009908 ** walk the cursor up the tree to the internal node and balance it as 009909 ** well. 
*/ 009910 assert( pCur->pPage->nOverflow==0 ); 009911 assert( pCur->pPage->nFree>=0 ); 009912 if( pCur->pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){ 009913 /* Optimization: If the free space is less than 2/3rds of the page, 009914 ** then balance() will always be a no-op. No need to invoke it. */ 009915 rc = SQLITE_OK; 009916 }else{ 009917 rc = balance(pCur); 009918 } 009919 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){ 009920 releasePageNotNull(pCur->pPage); 009921 pCur->iPage--; 009922 while( pCur->iPage>iCellDepth ){ 009923 releasePage(pCur->apPage[pCur->iPage--]); 009924 } 009925 pCur->pPage = pCur->apPage[pCur->iPage]; 009926 rc = balance(pCur); 009927 } 009928 009929 if( rc==SQLITE_OK ){ 009930 if( bPreserve>1 ){ 009931 assert( (pCur->iPage==iCellDepth || CORRUPT_DB) ); 009932 assert( pPage==pCur->pPage || CORRUPT_DB ); 009933 assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell ); 009934 pCur->eState = CURSOR_SKIPNEXT; 009935 if( iCellIdx>=pPage->nCell ){ 009936 pCur->skipNext = -1; 009937 pCur->ix = pPage->nCell-1; 009938 }else{ 009939 pCur->skipNext = 1; 009940 } 009941 }else{ 009942 rc = moveToRoot(pCur); 009943 if( bPreserve ){ 009944 btreeReleaseAllCursorPages(pCur); 009945 pCur->eState = CURSOR_REQUIRESEEK; 009946 } 009947 if( rc==SQLITE_EMPTY ) rc = SQLITE_OK; 009948 } 009949 } 009950 return rc; 009951 } 009952 009953 /* 009954 ** Create a new BTree table. Write into *piTable the page 009955 ** number for the root page of the new table. 009956 ** 009957 ** The type of type is determined by the flags parameter. Only the 009958 ** following values of flags are currently in use. 
Other values for 009959 ** flags might not work: 009960 ** 009961 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys 009962 ** BTREE_ZERODATA Used for SQL indices 009963 */ 009964 static int btreeCreateTable(Btree *p, Pgno *piTable, int createTabFlags){ 009965 BtShared *pBt = p->pBt; 009966 MemPage *pRoot; 009967 Pgno pgnoRoot; 009968 int rc; 009969 int ptfFlags; /* Page-type flags for the root page of new table */ 009970 009971 assert( sqlite3BtreeHoldsMutex(p) ); 009972 assert( pBt->inTransaction==TRANS_WRITE ); 009973 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 009974 009975 #ifdef SQLITE_OMIT_AUTOVACUUM 009976 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 009977 if( rc ){ 009978 return rc; 009979 } 009980 #else 009981 if( pBt->autoVacuum ){ 009982 Pgno pgnoMove; /* Move a page here to make room for the root-page */ 009983 MemPage *pPageMove; /* The page to move to. */ 009984 009985 /* Creating a new table may probably require moving an existing database 009986 ** to make room for the new tables root page. In case this page turns 009987 ** out to be an overflow page, delete all overflow page-map caches 009988 ** held by open cursors. 009989 */ 009990 invalidateAllOverflowCache(pBt); 009991 009992 /* Read the value of meta[3] from the database to determine where the 009993 ** root page of the new table should go. meta[3] is the largest root-page 009994 ** created so far, so the new root-page is (meta[3]+1). 009995 */ 009996 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot); 009997 if( pgnoRoot>btreePagecount(pBt) ){ 009998 return SQLITE_CORRUPT_PGNO(pgnoRoot); 009999 } 010000 pgnoRoot++; 010001 010002 /* The new root-page may not be allocated on a pointer-map page, or the 010003 ** PENDING_BYTE page. 010004 */ 010005 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || 010006 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ 010007 pgnoRoot++; 010008 } 010009 assert( pgnoRoot>=3 ); 010010 010011 /* Allocate a page. 
The page that currently resides at pgnoRoot will 010012 ** be moved to the allocated page (unless the allocated page happens 010013 ** to reside at pgnoRoot). 010014 */ 010015 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT); 010016 if( rc!=SQLITE_OK ){ 010017 return rc; 010018 } 010019 010020 if( pgnoMove!=pgnoRoot ){ 010021 /* pgnoRoot is the page that will be used for the root-page of 010022 ** the new table (assuming an error did not occur). But we were 010023 ** allocated pgnoMove. If required (i.e. if it was not allocated 010024 ** by extending the file), the current page at position pgnoMove 010025 ** is already journaled. 010026 */ 010027 u8 eType = 0; 010028 Pgno iPtrPage = 0; 010029 010030 /* Save the positions of any open cursors. This is required in 010031 ** case they are holding a reference to an xFetch reference 010032 ** corresponding to page pgnoRoot. */ 010033 rc = saveAllCursors(pBt, 0, 0); 010034 releasePage(pPageMove); 010035 if( rc!=SQLITE_OK ){ 010036 return rc; 010037 } 010038 010039 /* Move the page currently at pgnoRoot to pgnoMove. 
*/ 010040 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 010041 if( rc!=SQLITE_OK ){ 010042 return rc; 010043 } 010044 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); 010045 if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ 010046 rc = SQLITE_CORRUPT_PGNO(pgnoRoot); 010047 } 010048 if( rc!=SQLITE_OK ){ 010049 releasePage(pRoot); 010050 return rc; 010051 } 010052 assert( eType!=PTRMAP_ROOTPAGE ); 010053 assert( eType!=PTRMAP_FREEPAGE ); 010054 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0); 010055 releasePage(pRoot); 010056 010057 /* Obtain the page at pgnoRoot */ 010058 if( rc!=SQLITE_OK ){ 010059 return rc; 010060 } 010061 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 010062 if( rc!=SQLITE_OK ){ 010063 return rc; 010064 } 010065 rc = sqlite3PagerWrite(pRoot->pDbPage); 010066 if( rc!=SQLITE_OK ){ 010067 releasePage(pRoot); 010068 return rc; 010069 } 010070 }else{ 010071 pRoot = pPageMove; 010072 } 010073 010074 /* Update the pointer-map and meta-data with the new root-page number. */ 010075 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc); 010076 if( rc ){ 010077 releasePage(pRoot); 010078 return rc; 010079 } 010080 010081 /* When the new root page was allocated, page 1 was made writable in 010082 ** order either to increase the database filesize, or to decrement the 010083 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail. 
010084 */ 010085 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) ); 010086 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot); 010087 if( NEVER(rc) ){ 010088 releasePage(pRoot); 010089 return rc; 010090 } 010091 010092 }else{ 010093 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 010094 if( rc ) return rc; 010095 } 010096 #endif 010097 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 010098 if( createTabFlags & BTREE_INTKEY ){ 010099 ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF; 010100 }else{ 010101 ptfFlags = PTF_ZERODATA | PTF_LEAF; 010102 } 010103 zeroPage(pRoot, ptfFlags); 010104 sqlite3PagerUnref(pRoot->pDbPage); 010105 assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 ); 010106 *piTable = pgnoRoot; 010107 return SQLITE_OK; 010108 } 010109 int sqlite3BtreeCreateTable(Btree *p, Pgno *piTable, int flags){ 010110 int rc; 010111 sqlite3BtreeEnter(p); 010112 rc = btreeCreateTable(p, piTable, flags); 010113 sqlite3BtreeLeave(p); 010114 return rc; 010115 } 010116 010117 /* 010118 ** Erase the given database page and all its children. Return 010119 ** the page to the freelist. 
010120 */ 010121 static int clearDatabasePage( 010122 BtShared *pBt, /* The BTree that contains the table */ 010123 Pgno pgno, /* Page number to clear */ 010124 int freePageFlag, /* Deallocate page if true */ 010125 i64 *pnChange /* Add number of Cells freed to this counter */ 010126 ){ 010127 MemPage *pPage; 010128 int rc; 010129 unsigned char *pCell; 010130 int i; 010131 int hdr; 010132 CellInfo info; 010133 010134 assert( sqlite3_mutex_held(pBt->mutex) ); 010135 if( pgno>btreePagecount(pBt) ){ 010136 return SQLITE_CORRUPT_PGNO(pgno); 010137 } 010138 rc = getAndInitPage(pBt, pgno, &pPage, 0); 010139 if( rc ) return rc; 010140 if( (pBt->openFlags & BTREE_SINGLE)==0 010141 && sqlite3PagerPageRefcount(pPage->pDbPage) != (1 + (pgno==1)) 010142 ){ 010143 rc = SQLITE_CORRUPT_PAGE(pPage); 010144 goto cleardatabasepage_out; 010145 } 010146 hdr = pPage->hdrOffset; 010147 for(i=0; i<pPage->nCell; i++){ 010148 pCell = findCell(pPage, i); 010149 if( !pPage->leaf ){ 010150 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange); 010151 if( rc ) goto cleardatabasepage_out; 010152 } 010153 BTREE_CLEAR_CELL(rc, pPage, pCell, info); 010154 if( rc ) goto cleardatabasepage_out; 010155 } 010156 if( !pPage->leaf ){ 010157 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange); 010158 if( rc ) goto cleardatabasepage_out; 010159 if( pPage->intKey ) pnChange = 0; 010160 } 010161 if( pnChange ){ 010162 testcase( !pPage->intKey ); 010163 *pnChange += pPage->nCell; 010164 } 010165 if( freePageFlag ){ 010166 freePage(pPage, &rc); 010167 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ 010168 zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF); 010169 } 010170 010171 cleardatabasepage_out: 010172 releasePage(pPage); 010173 return rc; 010174 } 010175 010176 /* 010177 ** Delete all information from a single table in the database. iTable is 010178 ** the page number of the root of the table. After this routine returns, 010179 ** the root page is empty, but still exists. 
010180 ** 010181 ** This routine will fail with SQLITE_LOCKED if there are any open 010182 ** read cursors on the table. Open write cursors are moved to the 010183 ** root of the table. 010184 ** 010185 ** If pnChange is not NULL, then the integer value pointed to by pnChange 010186 ** is incremented by the number of entries in the table. 010187 */ 010188 int sqlite3BtreeClearTable(Btree *p, int iTable, i64 *pnChange){ 010189 int rc; 010190 BtShared *pBt = p->pBt; 010191 sqlite3BtreeEnter(p); 010192 assert( p->inTrans==TRANS_WRITE ); 010193 010194 rc = saveAllCursors(pBt, (Pgno)iTable, 0); 010195 010196 if( SQLITE_OK==rc ){ 010197 /* Invalidate all incrblob cursors open on table iTable (assuming iTable 010198 ** is the root of a table b-tree - if it is not, the following call is 010199 ** a no-op). */ 010200 if( p->hasIncrblobCur ){ 010201 invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1); 010202 } 010203 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange); 010204 } 010205 sqlite3BtreeLeave(p); 010206 return rc; 010207 } 010208 010209 /* 010210 ** Delete all information from the single table that pCur is open on. 010211 ** 010212 ** This routine only work for pCur on an ephemeral table. 010213 */ 010214 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){ 010215 return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0); 010216 } 010217 010218 /* 010219 ** Erase all information in a table and add the root of the table to 010220 ** the freelist. Except, the root of the principle table (the one on 010221 ** page 1) is never added to the freelist. 010222 ** 010223 ** This routine will fail with SQLITE_LOCKED if there are any open 010224 ** cursors on the table. 
010225 ** 010226 ** If AUTOVACUUM is enabled and the page at iTable is not the last 010227 ** root page in the database file, then the last root page 010228 ** in the database file is moved into the slot formerly occupied by 010229 ** iTable and that last slot formerly occupied by the last root page 010230 ** is added to the freelist instead of iTable. In this say, all 010231 ** root pages are kept at the beginning of the database file, which 010232 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the 010233 ** page number that used to be the last root page in the file before 010234 ** the move. If no page gets moved, *piMoved is set to 0. 010235 ** The last root page is recorded in meta[3] and the value of 010236 ** meta[3] is updated by this procedure. 010237 */ 010238 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ 010239 int rc; 010240 MemPage *pPage = 0; 010241 BtShared *pBt = p->pBt; 010242 010243 assert( sqlite3BtreeHoldsMutex(p) ); 010244 assert( p->inTrans==TRANS_WRITE ); 010245 assert( iTable>=2 ); 010246 if( iTable>btreePagecount(pBt) ){ 010247 return SQLITE_CORRUPT_PGNO(iTable); 010248 } 010249 010250 rc = sqlite3BtreeClearTable(p, iTable, 0); 010251 if( rc ) return rc; 010252 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0); 010253 if( NEVER(rc) ){ 010254 releasePage(pPage); 010255 return rc; 010256 } 010257 010258 *piMoved = 0; 010259 010260 #ifdef SQLITE_OMIT_AUTOVACUUM 010261 freePage(pPage, &rc); 010262 releasePage(pPage); 010263 #else 010264 if( pBt->autoVacuum ){ 010265 Pgno maxRootPgno; 010266 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno); 010267 010268 if( iTable==maxRootPgno ){ 010269 /* If the table being dropped is the table with the largest root-page 010270 ** number in the database, put the root page on the free list. 
010271 */ 010272 freePage(pPage, &rc); 010273 releasePage(pPage); 010274 if( rc!=SQLITE_OK ){ 010275 return rc; 010276 } 010277 }else{ 010278 /* The table being dropped does not have the largest root-page 010279 ** number in the database. So move the page that does into the 010280 ** gap left by the deleted root-page. 010281 */ 010282 MemPage *pMove; 010283 releasePage(pPage); 010284 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 010285 if( rc!=SQLITE_OK ){ 010286 return rc; 010287 } 010288 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0); 010289 releasePage(pMove); 010290 if( rc!=SQLITE_OK ){ 010291 return rc; 010292 } 010293 pMove = 0; 010294 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 010295 freePage(pMove, &rc); 010296 releasePage(pMove); 010297 if( rc!=SQLITE_OK ){ 010298 return rc; 010299 } 010300 *piMoved = maxRootPgno; 010301 } 010302 010303 /* Set the new 'max-root-page' value in the database header. This 010304 ** is the old value less one, less one more if that happens to 010305 ** be a root-page number, less one again if that is the 010306 ** PENDING_BYTE_PAGE. 010307 */ 010308 maxRootPgno--; 010309 while( maxRootPgno==PENDING_BYTE_PAGE(pBt) 010310 || PTRMAP_ISPAGE(pBt, maxRootPgno) ){ 010311 maxRootPgno--; 010312 } 010313 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); 010314 010315 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno); 010316 }else{ 010317 freePage(pPage, &rc); 010318 releasePage(pPage); 010319 } 010320 #endif 010321 return rc; 010322 } 010323 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ 010324 int rc; 010325 sqlite3BtreeEnter(p); 010326 rc = btreeDropTable(p, iTable, piMoved); 010327 sqlite3BtreeLeave(p); 010328 return rc; 010329 } 010330 010331 010332 /* 010333 ** This function may only be called if the b-tree connection already 010334 ** has a read or write transaction open on the database. 010335 ** 010336 ** Read the meta-information out of a database file. 
Meta[0] 010337 ** is the number of free pages currently in the database. Meta[1] 010338 ** through meta[15] are available for use by higher layers. Meta[0] 010339 ** is read-only, the others are read/write. 010340 ** 010341 ** The schema layer numbers meta values differently. At the schema 010342 ** layer (and the SetCookie and ReadCookie opcodes) the number of 010343 ** free pages is not visible. So Cookie[0] is the same as Meta[1]. 010344 ** 010345 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case. Instead 010346 ** of reading the value out of the header, it instead loads the "DataVersion" 010347 ** from the pager. The BTREE_DATA_VERSION value is not actually stored in the 010348 ** database file. It is a number computed by the pager. But its access 010349 ** pattern is the same as header meta values, and so it is convenient to 010350 ** read it from this routine. 010351 */ 010352 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ 010353 BtShared *pBt = p->pBt; 010354 010355 sqlite3BtreeEnter(p); 010356 assert( p->inTrans>TRANS_NONE ); 010357 assert( SQLITE_OK==querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK) ); 010358 assert( pBt->pPage1 ); 010359 assert( idx>=0 && idx<=15 ); 010360 010361 if( idx==BTREE_DATA_VERSION ){ 010362 *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iBDataVersion; 010363 }else{ 010364 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); 010365 } 010366 010367 /* If auto-vacuum is disabled in this build and this is an auto-vacuum 010368 ** database, mark the database as read-only. */ 010369 #ifdef SQLITE_OMIT_AUTOVACUUM 010370 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){ 010371 pBt->btsFlags |= BTS_READ_ONLY; 010372 } 010373 #endif 010374 010375 sqlite3BtreeLeave(p); 010376 } 010377 010378 /* 010379 ** Write meta-information back into the database. Meta[0] is 010380 ** read-only and may not be written. 
010381 */ 010382 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ 010383 BtShared *pBt = p->pBt; 010384 unsigned char *pP1; 010385 int rc; 010386 assert( idx>=1 && idx<=15 ); 010387 sqlite3BtreeEnter(p); 010388 assert( p->inTrans==TRANS_WRITE ); 010389 assert( pBt->pPage1!=0 ); 010390 pP1 = pBt->pPage1->aData; 010391 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 010392 if( rc==SQLITE_OK ){ 010393 put4byte(&pP1[36 + idx*4], iMeta); 010394 #ifndef SQLITE_OMIT_AUTOVACUUM 010395 if( idx==BTREE_INCR_VACUUM ){ 010396 assert( pBt->autoVacuum || iMeta==0 ); 010397 assert( iMeta==0 || iMeta==1 ); 010398 pBt->incrVacuum = (u8)iMeta; 010399 } 010400 #endif 010401 } 010402 sqlite3BtreeLeave(p); 010403 return rc; 010404 } 010405 010406 /* 010407 ** The first argument, pCur, is a cursor opened on some b-tree. Count the 010408 ** number of entries in the b-tree and write the result to *pnEntry. 010409 ** 010410 ** SQLITE_OK is returned if the operation is successfully executed. 010411 ** Otherwise, if an error is encountered (i.e. an IO error or database 010412 ** corruption) an SQLite error code is returned. 010413 */ 010414 int sqlite3BtreeCount(sqlite3 *db, BtCursor *pCur, i64 *pnEntry){ 010415 i64 nEntry = 0; /* Value to return in *pnEntry */ 010416 int rc; /* Return code */ 010417 010418 rc = moveToRoot(pCur); 010419 if( rc==SQLITE_EMPTY ){ 010420 *pnEntry = 0; 010421 return SQLITE_OK; 010422 } 010423 010424 /* Unless an error occurs, the following loop runs one iteration for each 010425 ** page in the B-Tree structure (not including overflow pages). 010426 */ 010427 while( rc==SQLITE_OK && !AtomicLoad(&db->u1.isInterrupted) ){ 010428 int iIdx; /* Index of child node in parent */ 010429 MemPage *pPage; /* Current page of the b-tree */ 010430 010431 /* If this is a leaf page or the tree is not an int-key tree, then 010432 ** this page contains countable entries. Increment the entry counter 010433 ** accordingly. 
010434 */ 010435 pPage = pCur->pPage; 010436 if( pPage->leaf || !pPage->intKey ){ 010437 nEntry += pPage->nCell; 010438 } 010439 010440 /* pPage is a leaf node. This loop navigates the cursor so that it 010441 ** points to the first interior cell that it points to the parent of 010442 ** the next page in the tree that has not yet been visited. The 010443 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell 010444 ** of the page, or to the number of cells in the page if the next page 010445 ** to visit is the right-child of its parent. 010446 ** 010447 ** If all pages in the tree have been visited, return SQLITE_OK to the 010448 ** caller. 010449 */ 010450 if( pPage->leaf ){ 010451 do { 010452 if( pCur->iPage==0 ){ 010453 /* All pages of the b-tree have been visited. Return successfully. */ 010454 *pnEntry = nEntry; 010455 return moveToRoot(pCur); 010456 } 010457 moveToParent(pCur); 010458 }while ( pCur->ix>=pCur->pPage->nCell ); 010459 010460 pCur->ix++; 010461 pPage = pCur->pPage; 010462 } 010463 010464 /* Descend to the child node of the cell that the cursor currently 010465 ** points at. This is the right-child if (iIdx==pPage->nCell). 010466 */ 010467 iIdx = pCur->ix; 010468 if( iIdx==pPage->nCell ){ 010469 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 010470 }else{ 010471 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx))); 010472 } 010473 } 010474 010475 /* An error has occurred. Return an error code. */ 010476 return rc; 010477 } 010478 010479 /* 010480 ** Return the pager associated with a BTree. This routine is used for 010481 ** testing and debugging only. 
010482 */ 010483 Pager *sqlite3BtreePager(Btree *p){ 010484 return p->pBt->pPager; 010485 } 010486 010487 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010488 /* 010489 ** Record an OOM error during integrity_check 010490 */ 010491 static void checkOom(IntegrityCk *pCheck){ 010492 pCheck->rc = SQLITE_NOMEM; 010493 pCheck->mxErr = 0; /* Causes integrity_check processing to stop */ 010494 if( pCheck->nErr==0 ) pCheck->nErr++; 010495 } 010496 010497 /* 010498 ** Invoke the progress handler, if appropriate. Also check for an 010499 ** interrupt. 010500 */ 010501 static void checkProgress(IntegrityCk *pCheck){ 010502 sqlite3 *db = pCheck->db; 010503 if( AtomicLoad(&db->u1.isInterrupted) ){ 010504 pCheck->rc = SQLITE_INTERRUPT; 010505 pCheck->nErr++; 010506 pCheck->mxErr = 0; 010507 } 010508 #ifndef SQLITE_OMIT_PROGRESS_CALLBACK 010509 if( db->xProgress ){ 010510 assert( db->nProgressOps>0 ); 010511 pCheck->nStep++; 010512 if( (pCheck->nStep % db->nProgressOps)==0 010513 && db->xProgress(db->pProgressArg) 010514 ){ 010515 pCheck->rc = SQLITE_INTERRUPT; 010516 pCheck->nErr++; 010517 pCheck->mxErr = 0; 010518 } 010519 } 010520 #endif 010521 } 010522 010523 /* 010524 ** Append a message to the error message string. 010525 */ 010526 static void checkAppendMsg( 010527 IntegrityCk *pCheck, 010528 const char *zFormat, 010529 ... 
010530 ){ 010531 va_list ap; 010532 checkProgress(pCheck); 010533 if( !pCheck->mxErr ) return; 010534 pCheck->mxErr--; 010535 pCheck->nErr++; 010536 va_start(ap, zFormat); 010537 if( pCheck->errMsg.nChar ){ 010538 sqlite3_str_append(&pCheck->errMsg, "\n", 1); 010539 } 010540 if( pCheck->zPfx ){ 010541 sqlite3_str_appendf(&pCheck->errMsg, pCheck->zPfx, 010542 pCheck->v0, pCheck->v1, pCheck->v2); 010543 } 010544 sqlite3_str_vappendf(&pCheck->errMsg, zFormat, ap); 010545 va_end(ap); 010546 if( pCheck->errMsg.accError==SQLITE_NOMEM ){ 010547 checkOom(pCheck); 010548 } 010549 } 010550 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 010551 010552 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010553 010554 /* 010555 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that 010556 ** corresponds to page iPg is already set. 010557 */ 010558 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){ 010559 assert( pCheck->aPgRef!=0 ); 010560 assert( iPg<=pCheck->nCkPage && sizeof(pCheck->aPgRef[0])==1 ); 010561 return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07))); 010562 } 010563 010564 /* 010565 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg. 010566 */ 010567 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){ 010568 assert( pCheck->aPgRef!=0 ); 010569 assert( iPg<=pCheck->nCkPage && sizeof(pCheck->aPgRef[0])==1 ); 010570 pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07)); 010571 } 010572 010573 010574 /* 010575 ** Add 1 to the reference count for page iPage. If this is the second 010576 ** reference to the page, add an error message to pCheck->zErrMsg. 010577 ** Return 1 if there are 2 or more references to the page and 0 if 010578 ** if this is the first reference to the page. 010579 ** 010580 ** Also check that the page number is in bounds. 
010581 */ 010582 static int checkRef(IntegrityCk *pCheck, Pgno iPage){ 010583 if( iPage>pCheck->nCkPage || iPage==0 ){ 010584 checkAppendMsg(pCheck, "invalid page number %u", iPage); 010585 return 1; 010586 } 010587 if( getPageReferenced(pCheck, iPage) ){ 010588 checkAppendMsg(pCheck, "2nd reference to page %u", iPage); 010589 return 1; 010590 } 010591 setPageReferenced(pCheck, iPage); 010592 return 0; 010593 } 010594 010595 #ifndef SQLITE_OMIT_AUTOVACUUM 010596 /* 010597 ** Check that the entry in the pointer-map for page iChild maps to 010598 ** page iParent, pointer type ptrType. If not, append an error message 010599 ** to pCheck. 010600 */ 010601 static void checkPtrmap( 010602 IntegrityCk *pCheck, /* Integrity check context */ 010603 Pgno iChild, /* Child page number */ 010604 u8 eType, /* Expected pointer map type */ 010605 Pgno iParent /* Expected pointer map parent page number */ 010606 ){ 010607 int rc; 010608 u8 ePtrmapType; 010609 Pgno iPtrmapParent; 010610 010611 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); 010612 if( rc!=SQLITE_OK ){ 010613 if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) checkOom(pCheck); 010614 checkAppendMsg(pCheck, "Failed to read ptrmap key=%u", iChild); 010615 return; 010616 } 010617 010618 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ 010619 checkAppendMsg(pCheck, 010620 "Bad ptr map entry key=%u expected=(%u,%u) got=(%u,%u)", 010621 iChild, eType, iParent, ePtrmapType, iPtrmapParent); 010622 } 010623 } 010624 #endif 010625 010626 /* 010627 ** Check the integrity of the freelist or of an overflow page list. 010628 ** Verify that the number of pages on the list is N. 010629 */ 010630 static void checkList( 010631 IntegrityCk *pCheck, /* Integrity checking context */ 010632 int isFreeList, /* True for a freelist. 
False for overflow page list */ 010633 Pgno iPage, /* Page number for first page in the list */ 010634 u32 N /* Expected number of pages in the list */ 010635 ){ 010636 int i; 010637 u32 expected = N; 010638 int nErrAtStart = pCheck->nErr; 010639 while( iPage!=0 && pCheck->mxErr ){ 010640 DbPage *pOvflPage; 010641 unsigned char *pOvflData; 010642 if( checkRef(pCheck, iPage) ) break; 010643 N--; 010644 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){ 010645 checkAppendMsg(pCheck, "failed to get page %u", iPage); 010646 break; 010647 } 010648 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); 010649 if( isFreeList ){ 010650 u32 n = (u32)get4byte(&pOvflData[4]); 010651 #ifndef SQLITE_OMIT_AUTOVACUUM 010652 if( pCheck->pBt->autoVacuum ){ 010653 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0); 010654 } 010655 #endif 010656 if( n>pCheck->pBt->usableSize/4-2 ){ 010657 checkAppendMsg(pCheck, 010658 "freelist leaf count too big on page %u", iPage); 010659 N--; 010660 }else{ 010661 for(i=0; i<(int)n; i++){ 010662 Pgno iFreePage = get4byte(&pOvflData[8+i*4]); 010663 #ifndef SQLITE_OMIT_AUTOVACUUM 010664 if( pCheck->pBt->autoVacuum ){ 010665 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0); 010666 } 010667 #endif 010668 checkRef(pCheck, iFreePage); 010669 } 010670 N -= n; 010671 } 010672 } 010673 #ifndef SQLITE_OMIT_AUTOVACUUM 010674 else{ 010675 /* If this database supports auto-vacuum and iPage is not the last 010676 ** page in this overflow list, check that the pointer-map entry for 010677 ** the following page matches iPage. 010678 */ 010679 if( pCheck->pBt->autoVacuum && N>0 ){ 010680 i = get4byte(pOvflData); 010681 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage); 010682 } 010683 } 010684 #endif 010685 iPage = get4byte(pOvflData); 010686 sqlite3PagerUnref(pOvflPage); 010687 } 010688 if( N && nErrAtStart==pCheck->nErr ){ 010689 checkAppendMsg(pCheck, 010690 "%s is %u but should be %u", 010691 isFreeList ? 
"size" : "overflow list length", 010692 expected-N, expected); 010693 } 010694 } 010695 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 010696 010697 /* 010698 ** An implementation of a min-heap. 010699 ** 010700 ** aHeap[0] is the number of elements on the heap. aHeap[1] is the 010701 ** root element. The daughter nodes of aHeap[N] are aHeap[N*2] 010702 ** and aHeap[N*2+1]. 010703 ** 010704 ** The heap property is this: Every node is less than or equal to both 010705 ** of its daughter nodes. A consequence of the heap property is that the 010706 ** root node aHeap[1] is always the minimum value currently in the heap. 010707 ** 010708 ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto 010709 ** the heap, preserving the heap property. The btreeHeapPull() routine 010710 ** removes the root element from the heap (the minimum value in the heap) 010711 ** and then moves other nodes around as necessary to preserve the heap 010712 ** property. 010713 ** 010714 ** This heap is used for cell overlap and coverage testing. Each u32 010715 ** entry represents the span of a cell or freeblock on a btree page. 010716 ** The upper 16 bits are the index of the first byte of a range and the 010717 ** lower 16 bits are the index of the last byte of that range. 
010718 */ 010719 static void btreeHeapInsert(u32 *aHeap, u32 x){ 010720 u32 j, i; 010721 assert( aHeap!=0 ); 010722 i = ++aHeap[0]; 010723 aHeap[i] = x; 010724 while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){ 010725 x = aHeap[j]; 010726 aHeap[j] = aHeap[i]; 010727 aHeap[i] = x; 010728 i = j; 010729 } 010730 } 010731 static int btreeHeapPull(u32 *aHeap, u32 *pOut){ 010732 u32 j, i, x; 010733 if( (x = aHeap[0])==0 ) return 0; 010734 *pOut = aHeap[1]; 010735 aHeap[1] = aHeap[x]; 010736 aHeap[x] = 0xffffffff; 010737 aHeap[0]--; 010738 i = 1; 010739 while( (j = i*2)<=aHeap[0] ){ 010740 if( aHeap[j]>aHeap[j+1] ) j++; 010741 if( aHeap[i]<aHeap[j] ) break; 010742 x = aHeap[i]; 010743 aHeap[i] = aHeap[j]; 010744 aHeap[j] = x; 010745 i = j; 010746 } 010747 return 1; 010748 } 010749 010750 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010751 /* 010752 ** Do various sanity checks on a single page of a tree. Return 010753 ** the tree depth. Root pages return 0. Parents of root pages 010754 ** return 1, and so forth. 010755 ** 010756 ** These checks are done: 010757 ** 010758 ** 1. Make sure that cells and freeblocks do not overlap 010759 ** but combine to completely cover the page. 010760 ** 2. Make sure integer cell keys are in order. 010761 ** 3. Check the integrity of overflow pages. 010762 ** 4. Recursively call checkTreePage on all children. 010763 ** 5. Verify that the depth of all children is the same. 
010764 */ 010765 static int checkTreePage( 010766 IntegrityCk *pCheck, /* Context for the sanity check */ 010767 Pgno iPage, /* Page number of the page to check */ 010768 i64 *piMinKey, /* Write minimum integer primary key here */ 010769 i64 maxKey /* Error if integer primary key greater than this */ 010770 ){ 010771 MemPage *pPage = 0; /* The page being analyzed */ 010772 int i; /* Loop counter */ 010773 int rc; /* Result code from subroutine call */ 010774 int depth = -1, d2; /* Depth of a subtree */ 010775 int pgno; /* Page number */ 010776 int nFrag; /* Number of fragmented bytes on the page */ 010777 int hdr; /* Offset to the page header */ 010778 int cellStart; /* Offset to the start of the cell pointer array */ 010779 int nCell; /* Number of cells */ 010780 int doCoverageCheck = 1; /* True if cell coverage checking should be done */ 010781 int keyCanBeEqual = 1; /* True if IPK can be equal to maxKey 010782 ** False if IPK must be strictly less than maxKey */ 010783 u8 *data; /* Page content */ 010784 u8 *pCell; /* Cell content */ 010785 u8 *pCellIdx; /* Next element of the cell pointer array */ 010786 BtShared *pBt; /* The BtShared object that owns pPage */ 010787 u32 pc; /* Address of a cell */ 010788 u32 usableSize; /* Usable size of the page */ 010789 u32 contentOffset; /* Offset to the start of the cell content area */ 010790 u32 *heap = 0; /* Min-heap used for checking cell coverage */ 010791 u32 x, prev = 0; /* Next and previous entry on the min-heap */ 010792 const char *saved_zPfx = pCheck->zPfx; 010793 int saved_v1 = pCheck->v1; 010794 int saved_v2 = pCheck->v2; 010795 u8 savedIsInit = 0; 010796 010797 /* Check that the page exists 010798 */ 010799 checkProgress(pCheck); 010800 if( pCheck->mxErr==0 ) goto end_of_check; 010801 pBt = pCheck->pBt; 010802 usableSize = pBt->usableSize; 010803 if( iPage==0 ) return 0; 010804 if( checkRef(pCheck, iPage) ) return 0; 010805 pCheck->zPfx = "Tree %u page %u: "; 010806 pCheck->v1 = iPage; 010807 if( (rc = 
btreeGetPage(pBt, iPage, &pPage, 0))!=0 ){ 010808 checkAppendMsg(pCheck, 010809 "unable to get the page. error code=%d", rc); 010810 if( rc==SQLITE_IOERR_NOMEM ) pCheck->rc = SQLITE_NOMEM; 010811 goto end_of_check; 010812 } 010813 010814 /* Clear MemPage.isInit to make sure the corruption detection code in 010815 ** btreeInitPage() is executed. */ 010816 savedIsInit = pPage->isInit; 010817 pPage->isInit = 0; 010818 if( (rc = btreeInitPage(pPage))!=0 ){ 010819 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */ 010820 checkAppendMsg(pCheck, 010821 "btreeInitPage() returns error code %d", rc); 010822 goto end_of_check; 010823 } 010824 if( (rc = btreeComputeFreeSpace(pPage))!=0 ){ 010825 assert( rc==SQLITE_CORRUPT ); 010826 checkAppendMsg(pCheck, "free space corruption", rc); 010827 goto end_of_check; 010828 } 010829 data = pPage->aData; 010830 hdr = pPage->hdrOffset; 010831 010832 /* Set up for cell analysis */ 010833 pCheck->zPfx = "Tree %u page %u cell %u: "; 010834 contentOffset = get2byteNotZero(&data[hdr+5]); 010835 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */ 010836 010837 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the 010838 ** number of cells on the page. */ 010839 nCell = get2byte(&data[hdr+3]); 010840 assert( pPage->nCell==nCell ); 010841 if( pPage->leaf || pPage->intKey==0 ){ 010842 pCheck->nRow += nCell; 010843 } 010844 010845 /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page 010846 ** immediately follows the b-tree page header. 
*/ 010847 cellStart = hdr + 12 - 4*pPage->leaf; 010848 assert( pPage->aCellIdx==&data[cellStart] ); 010849 pCellIdx = &data[cellStart + 2*(nCell-1)]; 010850 010851 if( !pPage->leaf ){ 010852 /* Analyze the right-child page of internal pages */ 010853 pgno = get4byte(&data[hdr+8]); 010854 #ifndef SQLITE_OMIT_AUTOVACUUM 010855 if( pBt->autoVacuum ){ 010856 pCheck->zPfx = "Tree %u page %u right child: "; 010857 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); 010858 } 010859 #endif 010860 depth = checkTreePage(pCheck, pgno, &maxKey, maxKey); 010861 keyCanBeEqual = 0; 010862 }else{ 010863 /* For leaf pages, the coverage check will occur in the same loop 010864 ** as the other cell checks, so initialize the heap. */ 010865 heap = pCheck->heap; 010866 heap[0] = 0; 010867 } 010868 010869 /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte 010870 ** integer offsets to the cell contents. */ 010871 for(i=nCell-1; i>=0 && pCheck->mxErr; i--){ 010872 CellInfo info; 010873 010874 /* Check cell size */ 010875 pCheck->v2 = i; 010876 assert( pCellIdx==&data[cellStart + i*2] ); 010877 pc = get2byteAligned(pCellIdx); 010878 pCellIdx -= 2; 010879 if( pc<contentOffset || pc>usableSize-4 ){ 010880 checkAppendMsg(pCheck, "Offset %u out of range %u..%u", 010881 pc, contentOffset, usableSize-4); 010882 doCoverageCheck = 0; 010883 continue; 010884 } 010885 pCell = &data[pc]; 010886 pPage->xParseCell(pPage, pCell, &info); 010887 if( pc+info.nSize>usableSize ){ 010888 checkAppendMsg(pCheck, "Extends off end of page"); 010889 doCoverageCheck = 0; 010890 continue; 010891 } 010892 010893 /* Check for integer primary key out of range */ 010894 if( pPage->intKey ){ 010895 if( keyCanBeEqual ? 
(info.nKey > maxKey) : (info.nKey >= maxKey) ){ 010896 checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey); 010897 } 010898 maxKey = info.nKey; 010899 keyCanBeEqual = 0; /* Only the first key on the page may ==maxKey */ 010900 } 010901 010902 /* Check the content overflow list */ 010903 if( info.nPayload>info.nLocal ){ 010904 u32 nPage; /* Number of pages on the overflow chain */ 010905 Pgno pgnoOvfl; /* First page of the overflow chain */ 010906 assert( pc + info.nSize - 4 <= usableSize ); 010907 nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4); 010908 pgnoOvfl = get4byte(&pCell[info.nSize - 4]); 010909 #ifndef SQLITE_OMIT_AUTOVACUUM 010910 if( pBt->autoVacuum ){ 010911 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage); 010912 } 010913 #endif 010914 checkList(pCheck, 0, pgnoOvfl, nPage); 010915 } 010916 010917 if( !pPage->leaf ){ 010918 /* Check sanity of left child page for internal pages */ 010919 pgno = get4byte(pCell); 010920 #ifndef SQLITE_OMIT_AUTOVACUUM 010921 if( pBt->autoVacuum ){ 010922 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); 010923 } 010924 #endif 010925 d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey); 010926 keyCanBeEqual = 0; 010927 if( d2!=depth ){ 010928 checkAppendMsg(pCheck, "Child page depth differs"); 010929 depth = d2; 010930 } 010931 }else{ 010932 /* Populate the coverage-checking heap for leaf pages */ 010933 btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1)); 010934 } 010935 } 010936 *piMinKey = maxKey; 010937 010938 /* Check for complete coverage of the page 010939 */ 010940 pCheck->zPfx = 0; 010941 if( doCoverageCheck && pCheck->mxErr>0 ){ 010942 /* For leaf pages, the min-heap has already been initialized and the 010943 ** cells have already been inserted. 
But for internal pages, that has 010944 ** not yet been done, so do it now */ 010945 if( !pPage->leaf ){ 010946 heap = pCheck->heap; 010947 heap[0] = 0; 010948 for(i=nCell-1; i>=0; i--){ 010949 u32 size; 010950 pc = get2byteAligned(&data[cellStart+i*2]); 010951 size = pPage->xCellSize(pPage, &data[pc]); 010952 btreeHeapInsert(heap, (pc<<16)|(pc+size-1)); 010953 } 010954 } 010955 assert( heap!=0 ); 010956 /* Add the freeblocks to the min-heap 010957 ** 010958 ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header 010959 ** is the offset of the first freeblock, or zero if there are no 010960 ** freeblocks on the page. 010961 */ 010962 i = get2byte(&data[hdr+1]); 010963 while( i>0 ){ 010964 int size, j; 010965 assert( (u32)i<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */ 010966 size = get2byte(&data[i+2]); 010967 assert( (u32)(i+size)<=usableSize ); /* due to btreeComputeFreeSpace() */ 010968 btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1)); 010969 /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a 010970 ** big-endian integer which is the offset in the b-tree page of the next 010971 ** freeblock in the chain, or zero if the freeblock is the last on the 010972 ** chain. */ 010973 j = get2byte(&data[i]); 010974 /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of 010975 ** increasing offset. */ 010976 assert( j==0 || j>i+size ); /* Enforced by btreeComputeFreeSpace() */ 010977 assert( (u32)j<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */ 010978 i = j; 010979 } 010980 /* Analyze the min-heap looking for overlap between cells and/or 010981 ** freeblocks, and counting the number of untracked bytes in nFrag. 010982 ** 010983 ** Each min-heap entry is of the form: (start_address<<16)|end_address. 010984 ** There is an implied first entry the covers the page header, the cell 010985 ** pointer index, and the gap between the cell pointer index and the start 010986 ** of cell content. 
010987 ** 010988 ** The loop below pulls entries from the min-heap in order and compares 010989 ** the start_address against the previous end_address. If there is an 010990 ** overlap, that means bytes are used multiple times. If there is a gap, 010991 ** that gap is added to the fragmentation count. 010992 */ 010993 nFrag = 0; 010994 prev = contentOffset - 1; /* Implied first min-heap entry */ 010995 while( btreeHeapPull(heap,&x) ){ 010996 if( (prev&0xffff)>=(x>>16) ){ 010997 checkAppendMsg(pCheck, 010998 "Multiple uses for byte %u of page %u", x>>16, iPage); 010999 break; 011000 }else{ 011001 nFrag += (x>>16) - (prev&0xffff) - 1; 011002 prev = x; 011003 } 011004 } 011005 nFrag += usableSize - (prev&0xffff) - 1; 011006 /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments 011007 ** is stored in the fifth field of the b-tree page header. 011008 ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the 011009 ** number of fragmented free bytes within the cell content area. 011010 */ 011011 if( heap[0]==0 && nFrag!=data[hdr+7] ){ 011012 checkAppendMsg(pCheck, 011013 "Fragmentation of %u bytes reported as %u on page %u", 011014 nFrag, data[hdr+7], iPage); 011015 } 011016 } 011017 011018 end_of_check: 011019 if( !doCoverageCheck ) pPage->isInit = savedIsInit; 011020 releasePage(pPage); 011021 pCheck->zPfx = saved_zPfx; 011022 pCheck->v1 = saved_v1; 011023 pCheck->v2 = saved_v2; 011024 return depth+1; 011025 } 011026 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 011027 011028 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 011029 /* 011030 ** This routine does a complete check of the given BTree file. aRoot[] is 011031 ** an array of pages numbers were each page number is the root page of 011032 ** a table. nRoot is the number of entries in aRoot. 011033 ** 011034 ** A read-only or read-write transaction must be opened before calling 011035 ** this function. 011036 ** 011037 ** Write the number of error seen in *pnErr. 
Except for some memory 011038 ** allocation errors, an error message held in memory obtained from 011039 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is 011040 ** returned. If a memory allocation error occurs, NULL is returned. 011041 ** 011042 ** If the first entry in aRoot[] is 0, that indicates that the list of 011043 ** root pages is incomplete. This is a "partial integrity-check". This 011044 ** happens when performing an integrity check on a single table. The 011045 ** zero is skipped, of course. But in addition, the freelist checks 011046 ** and the checks to make sure every page is referenced are also skipped, 011047 ** since obviously it is not possible to know which pages are covered by 011048 ** the unverified btrees. Except, if aRoot[1] is 1, then the freelist 011049 ** checks are still performed. 011050 */ 011051 int sqlite3BtreeIntegrityCheck( 011052 sqlite3 *db, /* Database connection that is running the check */ 011053 Btree *p, /* The btree to be checked */ 011054 Pgno *aRoot, /* An array of root pages numbers for individual trees */ 011055 Mem *aCnt, /* Memory cells to write counts for each tree to */ 011056 int nRoot, /* Number of entries in aRoot[] */ 011057 int mxErr, /* Stop reporting errors after this many */ 011058 int *pnErr, /* OUT: Write number of errors seen to this variable */ 011059 char **pzOut /* OUT: Write the error message string here */ 011060 ){ 011061 Pgno i; 011062 IntegrityCk sCheck; 011063 BtShared *pBt = p->pBt; 011064 u64 savedDbFlags = pBt->db->flags; 011065 char zErr[100]; 011066 int bPartial = 0; /* True if not checking all btrees */ 011067 int bCkFreelist = 1; /* True to scan the freelist */ 011068 VVA_ONLY( int nRef ); 011069 011070 assert( nRoot>0 ); 011071 assert( aCnt!=0 ); 011072 011073 /* aRoot[0]==0 means this is a partial check */ 011074 if( aRoot[0]==0 ){ 011075 assert( nRoot>1 ); 011076 bPartial = 1; 011077 if( aRoot[1]!=1 ) bCkFreelist = 0; 011078 } 011079 011080 sqlite3BtreeEnter(p); 
011081 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE ); 011082 VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) ); 011083 assert( nRef>=0 ); 011084 memset(&sCheck, 0, sizeof(sCheck)); 011085 sCheck.db = db; 011086 sCheck.pBt = pBt; 011087 sCheck.pPager = pBt->pPager; 011088 sCheck.nCkPage = btreePagecount(sCheck.pBt); 011089 sCheck.mxErr = mxErr; 011090 sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH); 011091 sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL; 011092 if( sCheck.nCkPage==0 ){ 011093 goto integrity_ck_cleanup; 011094 } 011095 011096 sCheck.aPgRef = sqlite3MallocZero((sCheck.nCkPage / 8)+ 1); 011097 if( !sCheck.aPgRef ){ 011098 checkOom(&sCheck); 011099 goto integrity_ck_cleanup; 011100 } 011101 sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize ); 011102 if( sCheck.heap==0 ){ 011103 checkOom(&sCheck); 011104 goto integrity_ck_cleanup; 011105 } 011106 011107 i = PENDING_BYTE_PAGE(pBt); 011108 if( i<=sCheck.nCkPage ) setPageReferenced(&sCheck, i); 011109 011110 /* Check the integrity of the freelist 011111 */ 011112 if( bCkFreelist ){ 011113 sCheck.zPfx = "Freelist: "; 011114 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), 011115 get4byte(&pBt->pPage1->aData[36])); 011116 sCheck.zPfx = 0; 011117 } 011118 011119 /* Check all the tables. 
011120 */ 011121 #ifndef SQLITE_OMIT_AUTOVACUUM 011122 if( !bPartial ){ 011123 if( pBt->autoVacuum ){ 011124 Pgno mx = 0; 011125 Pgno mxInHdr; 011126 for(i=0; (int)i<nRoot; i++) if( mx<aRoot[i] ) mx = aRoot[i]; 011127 mxInHdr = get4byte(&pBt->pPage1->aData[52]); 011128 if( mx!=mxInHdr ){ 011129 checkAppendMsg(&sCheck, 011130 "max rootpage (%u) disagrees with header (%u)", 011131 mx, mxInHdr 011132 ); 011133 } 011134 }else if( get4byte(&pBt->pPage1->aData[64])!=0 ){ 011135 checkAppendMsg(&sCheck, 011136 "incremental_vacuum enabled with a max rootpage of zero" 011137 ); 011138 } 011139 } 011140 #endif 011141 testcase( pBt->db->flags & SQLITE_CellSizeCk ); 011142 pBt->db->flags &= ~(u64)SQLITE_CellSizeCk; 011143 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){ 011144 sCheck.nRow = 0; 011145 if( aRoot[i] ){ 011146 i64 notUsed; 011147 #ifndef SQLITE_OMIT_AUTOVACUUM 011148 if( pBt->autoVacuum && aRoot[i]>1 && !bPartial ){ 011149 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0); 011150 } 011151 #endif 011152 sCheck.v0 = aRoot[i]; 011153 checkTreePage(&sCheck, aRoot[i], ¬Used, LARGEST_INT64); 011154 } 011155 sqlite3MemSetArrayInt64(aCnt, i, sCheck.nRow); 011156 } 011157 pBt->db->flags = savedDbFlags; 011158 011159 /* Make sure every page in the file is referenced 011160 */ 011161 if( !bPartial ){ 011162 for(i=1; i<=sCheck.nCkPage && sCheck.mxErr; i++){ 011163 #ifdef SQLITE_OMIT_AUTOVACUUM 011164 if( getPageReferenced(&sCheck, i)==0 ){ 011165 checkAppendMsg(&sCheck, "Page %u: never used", i); 011166 } 011167 #else 011168 /* If the database supports auto-vacuum, make sure no tables contain 011169 ** references to pointer-map pages. 
011170 */ 011171 if( getPageReferenced(&sCheck, i)==0 && 011172 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ 011173 checkAppendMsg(&sCheck, "Page %u: never used", i); 011174 } 011175 if( getPageReferenced(&sCheck, i)!=0 && 011176 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ 011177 checkAppendMsg(&sCheck, "Page %u: pointer map referenced", i); 011178 } 011179 #endif 011180 } 011181 } 011182 011183 /* Clean up and report errors. 011184 */ 011185 integrity_ck_cleanup: 011186 sqlite3PageFree(sCheck.heap); 011187 sqlite3_free(sCheck.aPgRef); 011188 *pnErr = sCheck.nErr; 011189 if( sCheck.nErr==0 ){ 011190 sqlite3_str_reset(&sCheck.errMsg); 011191 *pzOut = 0; 011192 }else{ 011193 *pzOut = sqlite3StrAccumFinish(&sCheck.errMsg); 011194 } 011195 /* Make sure this analysis did not leave any unref() pages. */ 011196 assert( nRef==sqlite3PagerRefcount(pBt->pPager) ); 011197 sqlite3BtreeLeave(p); 011198 return sCheck.rc; 011199 } 011200 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 011201 011202 /* 011203 ** Return the full pathname of the underlying database file. Return 011204 ** an empty string if the database is in-memory or a TEMP database. 011205 ** 011206 ** The pager filename is invariant as long as the pager is 011207 ** open so it is safe to access without the BtShared mutex. 011208 */ 011209 const char *sqlite3BtreeGetFilename(Btree *p){ 011210 assert( p->pBt->pPager!=0 ); 011211 return sqlite3PagerFilename(p->pBt->pPager, 1); 011212 } 011213 011214 /* 011215 ** Return the pathname of the journal file for this database. The return 011216 ** value of this routine is the same regardless of whether the journal file 011217 ** has been created or not. 011218 ** 011219 ** The pager journal filename is invariant as long as the pager is 011220 ** open so it is safe to access without the BtShared mutex. 
011221 */ 011222 const char *sqlite3BtreeGetJournalname(Btree *p){ 011223 assert( p->pBt->pPager!=0 ); 011224 return sqlite3PagerJournalname(p->pBt->pPager); 011225 } 011226 011227 /* 011228 ** Return one of SQLITE_TXN_NONE, SQLITE_TXN_READ, or SQLITE_TXN_WRITE 011229 ** to describe the current transaction state of Btree p. 011230 */ 011231 int sqlite3BtreeTxnState(Btree *p){ 011232 assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); 011233 return p ? p->inTrans : 0; 011234 } 011235 011236 #ifndef SQLITE_OMIT_WAL 011237 /* 011238 ** Run a checkpoint on the Btree passed as the first argument. 011239 ** 011240 ** Return SQLITE_LOCKED if this or any other connection has an open 011241 ** transaction on the shared-cache the argument Btree is connected to. 011242 ** 011243 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART. 011244 */ 011245 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){ 011246 int rc = SQLITE_OK; 011247 if( p ){ 011248 BtShared *pBt = p->pBt; 011249 sqlite3BtreeEnter(p); 011250 if( pBt->inTransaction!=TRANS_NONE ){ 011251 rc = SQLITE_LOCKED; 011252 }else{ 011253 rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt); 011254 } 011255 sqlite3BtreeLeave(p); 011256 } 011257 return rc; 011258 } 011259 #endif 011260 011261 /* 011262 ** Return true if there is currently a backup running on Btree p. 011263 */ 011264 int sqlite3BtreeIsInBackup(Btree *p){ 011265 assert( p ); 011266 assert( sqlite3_mutex_held(p->db->mutex) ); 011267 return p->nBackup!=0; 011268 } 011269 011270 /* 011271 ** This function returns a pointer to a blob of memory associated with 011272 ** a single shared-btree. The memory is used by client code for its own 011273 ** purposes (for example, to store a high-level schema associated with 011274 ** the shared-btree). The btree layer manages reference counting issues. 
011275 ** 011276 ** The first time this is called on a shared-btree, nBytes bytes of memory 011277 ** are allocated, zeroed, and returned to the caller. For each subsequent 011278 ** call the nBytes parameter is ignored and a pointer to the same blob 011279 ** of memory returned. 011280 ** 011281 ** If the nBytes parameter is 0 and the blob of memory has not yet been 011282 ** allocated, a null pointer is returned. If the blob has already been 011283 ** allocated, it is returned as normal. 011284 ** 011285 ** Just before the shared-btree is closed, the function passed as the 011286 ** xFree argument when the memory allocation was made is invoked on the 011287 ** blob of allocated memory. The xFree function should not call sqlite3_free() 011288 ** on the memory, the btree layer does that. 011289 */ 011290 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ 011291 BtShared *pBt = p->pBt; 011292 sqlite3BtreeEnter(p); 011293 if( !pBt->pSchema && nBytes ){ 011294 pBt->pSchema = sqlite3DbMallocZero(0, nBytes); 011295 pBt->xFreeSchema = xFree; 011296 } 011297 sqlite3BtreeLeave(p); 011298 return pBt->pSchema; 011299 } 011300 011301 /* 011302 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared 011303 ** btree as the argument handle holds an exclusive lock on the 011304 ** sqlite_schema table. Otherwise SQLITE_OK. 011305 */ 011306 int sqlite3BtreeSchemaLocked(Btree *p){ 011307 int rc; 011308 assert( sqlite3_mutex_held(p->db->mutex) ); 011309 sqlite3BtreeEnter(p); 011310 rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK); 011311 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE ); 011312 sqlite3BtreeLeave(p); 011313 return rc; 011314 } 011315 011316 011317 #ifndef SQLITE_OMIT_SHARED_CACHE 011318 /* 011319 ** Obtain a lock on the table whose root page is iTab. The 011320 ** lock is a write lock if isWritelock is true or a read lock 011321 ** if it is false. 
011322 */ 011323 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ 011324 int rc = SQLITE_OK; 011325 assert( p->inTrans!=TRANS_NONE ); 011326 if( p->sharable ){ 011327 u8 lockType = READ_LOCK + isWriteLock; 011328 assert( READ_LOCK+1==WRITE_LOCK ); 011329 assert( isWriteLock==0 || isWriteLock==1 ); 011330 011331 sqlite3BtreeEnter(p); 011332 rc = querySharedCacheTableLock(p, iTab, lockType); 011333 if( rc==SQLITE_OK ){ 011334 rc = setSharedCacheTableLock(p, iTab, lockType); 011335 } 011336 sqlite3BtreeLeave(p); 011337 } 011338 return rc; 011339 } 011340 #endif 011341 011342 #ifndef SQLITE_OMIT_INCRBLOB 011343 /* 011344 ** Argument pCsr must be a cursor opened for writing on an 011345 ** INTKEY table currently pointing at a valid table entry. 011346 ** This function modifies the data stored as part of that entry. 011347 ** 011348 ** Only the data content may only be modified, it is not possible to 011349 ** change the length of the data stored. If this function is called with 011350 ** parameters that attempt to write past the end of the existing data, 011351 ** no modifications are made and SQLITE_CORRUPT is returned. 011352 */ 011353 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ 011354 int rc; 011355 assert( cursorOwnsBtShared(pCsr) ); 011356 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); 011357 assert( pCsr->curFlags & BTCF_Incrblob ); 011358 011359 rc = restoreCursorPosition(pCsr); 011360 if( rc!=SQLITE_OK ){ 011361 return rc; 011362 } 011363 assert( pCsr->eState!=CURSOR_REQUIRESEEK ); 011364 if( pCsr->eState!=CURSOR_VALID ){ 011365 return SQLITE_ABORT; 011366 } 011367 011368 /* Save the positions of all other cursors open on this table. This is 011369 ** required in case any of them are holding references to an xFetch 011370 ** version of the b-tree page modified by the accessPayload call below. 
011371 ** 011372 ** Note that pCsr must be open on a INTKEY table and saveCursorPosition() 011373 ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence 011374 ** saveAllCursors can only return SQLITE_OK. 011375 */ 011376 VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr); 011377 assert( rc==SQLITE_OK ); 011378 011379 /* Check some assumptions: 011380 ** (a) the cursor is open for writing, 011381 ** (b) there is a read/write transaction open, 011382 ** (c) the connection holds a write-lock on the table (if required), 011383 ** (d) there are no conflicting read-locks, and 011384 ** (e) the cursor points at a valid row of an intKey table. 011385 */ 011386 if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){ 011387 return SQLITE_READONLY; 011388 } 011389 assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0 011390 && pCsr->pBt->inTransaction==TRANS_WRITE ); 011391 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) ); 011392 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) ); 011393 assert( pCsr->pPage->intKey ); 011394 011395 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1); 011396 } 011397 011398 /* 011399 ** Mark this cursor as an incremental blob cursor. 011400 */ 011401 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){ 011402 pCur->curFlags |= BTCF_Incrblob; 011403 pCur->pBtree->hasIncrblobCur = 1; 011404 } 011405 #endif 011406 011407 /* 011408 ** Set both the "read version" (single byte at byte offset 18) and 011409 ** "write version" (single byte at byte offset 19) fields in the database 011410 ** header to iVersion. 011411 */ 011412 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){ 011413 BtShared *pBt = pBtree->pBt; 011414 int rc; /* Return code */ 011415 011416 assert( iVersion==1 || iVersion==2 ); 011417 011418 /* If setting the version fields to 1, do not automatically open the 011419 ** WAL connection, even if the version fields are currently set to 2. 
011420 */ 011421 pBt->btsFlags &= ~BTS_NO_WAL; 011422 if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL; 011423 011424 rc = sqlite3BtreeBeginTrans(pBtree, 0, 0); 011425 if( rc==SQLITE_OK ){ 011426 u8 *aData = pBt->pPage1->aData; 011427 if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){ 011428 rc = sqlite3BtreeBeginTrans(pBtree, 2, 0); 011429 if( rc==SQLITE_OK ){ 011430 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 011431 if( rc==SQLITE_OK ){ 011432 aData[18] = (u8)iVersion; 011433 aData[19] = (u8)iVersion; 011434 } 011435 } 011436 } 011437 } 011438 011439 pBt->btsFlags &= ~BTS_NO_WAL; 011440 return rc; 011441 } 011442 011443 /* 011444 ** Return true if the cursor has a hint specified. This routine is 011445 ** only used from within assert() statements 011446 */ 011447 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){ 011448 return (pCsr->hints & mask)!=0; 011449 } 011450 011451 /* 011452 ** Return true if the given Btree is read-only. 011453 */ 011454 int sqlite3BtreeIsReadonly(Btree *p){ 011455 return (p->pBt->btsFlags & BTS_READ_ONLY)!=0; 011456 } 011457 011458 /* 011459 ** Return the size of the header added to each page by this module. 011460 */ 011461 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); } 011462 011463 /* 011464 ** If no transaction is active and the database is not a temp-db, clear 011465 ** the in-memory pager cache. 011466 */ 011467 void sqlite3BtreeClearCache(Btree *p){ 011468 BtShared *pBt = p->pBt; 011469 if( pBt->inTransaction==TRANS_NONE ){ 011470 sqlite3PagerClearCache(pBt->pPager); 011471 } 011472 } 011473 011474 #if !defined(SQLITE_OMIT_SHARED_CACHE) 011475 /* 011476 ** Return true if the Btree passed as the only argument is sharable. 011477 */ 011478 int sqlite3BtreeSharable(Btree *p){ 011479 return p->sharable; 011480 } 011481 011482 /* 011483 ** Return the number of connections to the BtShared object accessed by 011484 ** the Btree handle passed as the only argument. 
For private caches 011485 ** this is always 1. For shared caches it may be 1 or greater. 011486 */ 011487 int sqlite3BtreeConnectionCount(Btree *p){ 011488 testcase( p->sharable ); 011489 return p->pBt->nRef; 011490 } 011491 #endif