 /*- 7  * See the file LICENSE for redistribution information.   *!  * Copyright (c) 1996, 1997, 1998 ,  *	Sleepycat Software.  All rights reserved.  */  #include "config.h"    #ifndef lintF static const char sccsid[] = "@(#)mp_bh.c	10.45 (Sleepycat) 11/25/98"; #endif /* not lint */    #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h>   #include <errno.h> #include <string.h>  #include <unistd.h>  #endif   #include "db_int.h"  #include "shqueue.h" #include "db_shash.h"  #include "mp.h"  #include "common_ext.h"   I static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *));    /*  * __memp_bhwrite --8  *	Write the page associated with a given bucket header.  *  * PUBLIC: int __memp_bhwrite B  * PUBLIC:     __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *));  */  int 0 __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) 	DB_MPOOL *dbmp; 	MPOOLFILE *mfp;	 	BH *bhp;  	int *restartp, *wrotep; {  	DB_MPOOLFILE *dbmfp;  	DB_MPREG *mpreg;  	int incremented, ret;   	if (restartp != NULL) 		*restartp = 0; 	if (wrotep != NULL) 		*wrotep = 0; 	incremented = 0;    	/* F 	 * Walk the process' DB_MPOOLFILE list and find a file descriptor forE 	 * the file.  We also check that the descriptor is open for writing. G 	 * If we find a descriptor on the file that's not open for writing, we H 	 * try and upgrade it to make it writeable.  If that fails, we're done. 	 */  	LOCKHANDLE(dbmp, dbmp->mutexp);( 	for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);1 	    dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))  		if (dbmfp->mfp == mfp) {% 			if (F_ISSET(dbmfp, MP_READONLY) && * 			    __memp_upgrade(dbmp, dbmfp, mfp)) {% 				UNLOCKHANDLE(dbmp, dbmp->mutexp);  				return (0);  			}   			/* 9 			 * Increment the reference count -- see the comment in  			 * memp_fclose(). 			 */ 			++dbmfp->ref; 			incremented = 1; 	 			break;  		} " 	UNLOCKHANDLE(dbmp, dbmp->mutexp); 	if (dbmfp != NULL) 
 		goto found;    	/* C 	 * It's not a page from a file we've opened.  If the file requires D 	 * input/output processing, see if this process has ever registeredF 	 * information as to how to write this type of file.  If not, there's 	 * nothing we can do. 	 */ 	if (mfp->ftype != 0) { ! 		LOCKHANDLE(dbmp, dbmp->mutexp); ) 		for (mpreg = LIST_FIRST(&dbmp->dbregq); 1 		    mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) " 			if (mpreg->ftype == mfp->ftype)
 				break;# 		UNLOCKHANDLE(dbmp, dbmp->mutexp);  		if (mpreg == NULL) 			return (0); 	}   	/* C 	 * Try and open the file, attaching to the underlying shared area.  	 *  	 * XXX F 	 * Don't try to attach to temporary files.  There are two problems inG 	 * trying to do that.  First, if we have different privileges than the G 	 * process that "owns" the temporary file, we might create the backing F 	 * disk file such that the owning process couldn't read/write its ownG 	 * buffers, e.g., memp_trickle() running as root creating a file owned F 	 * as root, mode 600.  Second, if the temporary file has already beenH 	 * created, we don't have any way of finding out what its real name is,E 	 * and, even if we did, it was already unlinked (so that it won't be H 	 * left if the process dies horribly).  This decision causes a problem,D 	 * however: if the temporary file consumes the entire buffer cache,D 	 * and the owner doesn't flush the buffers to disk, we could end upG 	 * with resource starvation, and the memp_trickle() thread couldn't do B 	 * anything about it.  That's a pretty unlikely scenario, though. 	 *  	 * XXX F 	 * There's no negative cache, so we may repeatedly try and open files7 	 * that we have previously tried (and failed) to open.  	 * 8 	 * Ignore any error, assume it's a permissions problem. 	 */ 	if (F_ISSET(mfp, MP_TEMP)) 
 		return (0);   9 	if (__memp_fopen(dbmp, mfp, R_ADDR(dbmp, mfp->path_off), 8 	    0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0)
 		return (0);   : found:	ret = __memp_pgwrite(dbmfp, bhp, restartp, wrotep);   	if (incremented) { ! 		LOCKHANDLE(dbmp, dbmp->mutexp);  		--dbmfp->ref; # 		UNLOCKHANDLE(dbmp, dbmp->mutexp);  	}   	return (ret); }    /*  * __memp_pgread --   *	Read a page from a file.   *>  * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));  */  int % __memp_pgread(dbmfp, bhp, can_create)  	DB_MPOOLFILE *dbmfp; 	 	BH *bhp;  	int can_create; { 
 	DB_IO db_io;  	DB_MPOOL *dbmp; 	MPOOLFILE *mfp; 	size_t len, pagesize; 	ssize_t nr; 	int created, ret;   	dbmp = dbmfp->dbmp; 	mfp = dbmfp->mfp;" 	pagesize = mfp->stat.st_pagesize;  " 	F_SET(bhp, BH_LOCKED | BH_TRASH); 	LOCKBUFFER(dbmp, bhp);  	UNLOCKREGION(dbmp);   	/* C 	 * Temporary files may not yet have been created.  We don't create ? 	 * them now, we create them when the pages have to be flushed.  	 */ 	nr = 0; 	if (dbmfp->fd == -1) 
 		ret = 0; 	else {  		/*A 		 * Ignore read errors if we have permission to create the page. A 		 * Assume that the page doesn't exist, and that we'll create it  		 * when we write it out. 		 */  		db_io.fd_io = dbmfp->fd;# 		db_io.fd_lock = dbmp->reginfo.fd;  		db_io.mutexp =: 		    F_ISSET(dbmp, MP_LOCKHANDLE) ? dbmfp->mutexp : NULL;* 		db_io.pagesize = db_io.bytes = pagesize; 		db_io.pgno = bhp->pgno;  		db_io.buf = bhp->buf;   ) 		ret = __os_io(&db_io, DB_IO_READ, &nr);  	}  
 	created = 0;  	if (nr < (ssize_t)pagesize) 		if (can_create)  			created = 1;  		else {. 			/* If we had a short read, ret may be 0. */ 			if (ret == 0) 				ret = EIO; 			__db_err(dbmp->dbenv,9 			    "%s: page %lu doesn't exist, create flag not set", , 			    __memp_fn(dbmfp), (u_long)bhp->pgno); 			goto err; 		}    	/* E 	 * Clear any bytes we didn't read that need to be cleared.  If we're D 	 * running in diagnostic mode, smash any bytes on the page that are& 	 * unknown quantities for the caller. 	 */ 	if (nr != (ssize_t)pagesize) { 8 		len = mfp->clear_len == 0 ? pagesize : mfp->clear_len; 		if (nr < (ssize_t)len)& 			memset(bhp->buf + nr, 0, len - nr); #ifdef DIAGNOSTIC  		if (nr > (ssize_t)len) 			len = nr; 		if (len < pagesize) 0 			memset(bhp->buf + len, 0xdb, pagesize - len); #endif 	}   	/* Call any pgin function. */6 	ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);  7 	/* Unlock the buffer and reacquire the region lock. */  err:	UNLOCKBUFFER(dbmp, bhp);  	LOCKREGION(dbmp);   	/* D 	 * If no errors occurred, the data is now valid, clear the BH_TRASHG 	 * flag; regardless, clear the lock bit and let other threads proceed.  	 */ 	F_CLR(bhp, BH_LOCKED);  	if (ret == 0) { 		F_CLR(bhp, BH_TRASH);    		/* Update the statistics. */ 		if (created) {# 			++dbmp->mp->stat.st_page_create;  			++mfp->stat.st_page_create;
 		} else { 			++dbmp->mp->stat.st_page_in;  			++mfp->stat.st_page_in; 		}  	}   	return (ret); }    /*  * __memp_pgwrite --  *	Write a page to a file.  *H  * PUBLIC: int __memp_pgwrite __P((DB_MPOOLFILE *, BH *, int *, int *));  */  int , __memp_pgwrite(dbmfp, bhp, restartp, wrotep) 	DB_MPOOLFILE *dbmfp; 	 	BH *bhp;  	int *restartp, *wrotep; {  	DB_ENV *dbenv; 
 	DB_IO db_io;  	DB_LOG *lg_info;  	DB_LSN lsn; 	DB_MPOOL *dbmp; 	MPOOL *mp;  	MPOOLFILE *mfp; 	ssize_t nw; 	int callpgin, ret, syncfail;  	const char *fail;   	dbmp = dbmfp->dbmp; 	dbenv = dbmp->dbenv;  	mp = dbmp->mp;  	mfp = dbmfp->mfp;   	if (restartp != NULL) 		*restartp = 0; 	if (wrotep != NULL) 		*wrotep = 0; 	callpgin = 0;   	/* E 	 * Check the dirty bit -- this buffer may have been written since we  	 * decided to write it. 	 */ 	if (!F_ISSET(bhp, BH_DIRTY)) {  		if (wrotep != NULL)  			*wrotep = 1; 
 		return (0);  	}   	LOCKBUFFER(dbmp, bhp);    	/* F 	 * If there were two writers, we may have just been waiting while theG 	 * other writer completed I/O on this buffer.  Check the dirty bit one  	 * more time. 	 */ 	if (!F_ISSET(bhp, BH_DIRTY)) {  		UNLOCKBUFFER(dbmp, bhp);   		if (wrotep != NULL)  			*wrotep = 1; 
 		return (0);  	}   	F_SET(bhp, BH_LOCKED);  	UNLOCKREGION(dbmp);   	if (restartp != NULL) 		*restartp = 1;  ; 	/* Copy the LSN off the page if we're going to need it. */  	lg_info = dbenv->lg_info;/ 	if (lg_info != NULL || F_ISSET(bhp, BH_WRITE)) 8 		memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));  6 	/* Ensure the appropriate log records are on disk. */> 	if (lg_info != NULL && (ret = log_flush(lg_info, &lsn)) != 0) 		goto err;c   	/*cF 	 * Call any pgout function.  We set the callpgin flag so that we flagG 	 * that the contents of the buffer will need to be passed through pgine 	 * before they are reused.h 	 */ 	if (mfp->ftype == 0)d
 		ret = 0; 	else {t 		callpgin = 1;h, 		if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0) 			goto err; 	}  5 	/* Temporary files may not yet have been created. */, 	if (dbmfp->fd == -1) {E" 		LOCKHANDLE(dbmp, dbmfp->mutexp);4 		if (dbmfp->fd == -1 && ((ret = __db_appname(dbenv,A 		    DB_APP_TMP, NULL, NULL, DB_CREATE | DB_EXCL | DB_TEMPORARY,L4 		    &dbmfp->fd, NULL)) != 0 || dbmfp->fd == -1)) {% 			UNLOCKHANDLE(dbmp, dbmfp->mutexp);w 			__db_err(dbenv,2 			    "unable to create temporary backing file"); 			goto err; 		}P$ 		UNLOCKHANDLE(dbmp, dbmfp->mutexp); 	}   	/* Write the page. */ 	db_io.fd_io = dbmfp->fd;a" 	db_io.fd_lock = dbmp->reginfo.fd;D 	db_io.mutexp = F_ISSET(dbmp, MP_LOCKHANDLE) ? dbmfp->mutexp : NULL;6 	db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize; 	db_io.pgno = bhp->pgno; 	db_io.buf = bhp->buf;6 	if ((ret = __os_io(&db_io, DB_IO_WRITE, &nw)) != 0) { 		__db_panic(dbenv, ret);  		fail = "write";t 		goto syserr; 	}, 	if (nw != (ssize_t)mfp->stat.st_pagesize) { 		ret = EIO; 		fail = "write";b 		goto syserr; 	}   	if (wrotep != NULL) 		*wrotep = 1;  7 	/* Unlock the buffer and reacquire the region lock. */  	UNLOCKBUFFER(dbmp, bhp);) 	LOCKREGION(dbmp);   	/*(3 	 * Clean up the flags based on a successful write.t 	 * ? 	 * If we rewrote the page, it will need processing by the pgin  	 * routine before reuse.  	 */ 	if (callpgin) 		F_SET(bhp, BH_CALLPGIN);" 	F_CLR(bhp, BH_DIRTY | BH_LOCKED);   	/*LB 	 * If we write a buffer for which a checkpoint is waiting, updateB 	 * the count of pending buffers (both in the mpool as a whole andC 	 * for this file).  If the count for this file goes to zero, flush  	 * the writes.n 	 *t 	 * XXX:D 	 * Don't lock the region around the sync, fsync(2) has no atomicity 	 * issues.  	 *> 	 * XXX:D 	 * We ignore errors from the sync -- it makes no sense to return anF 	 * error to the calling process, so set a flag causing the checkpoint 	 * to be retried later. 	 */ 	if (F_ISSET(bhp, BH_WRITE)) { 		if (mfp->lsn_cnt == 1) { 			UNLOCKREGION(dbmp);) 			syncfail = __os_fsync(dbmfp->fd) != 0;a 			LOCKREGION(dbmp); 			if (syncfail) 				F_SET(mp, MP_LSN_RETRY);   		}t   		F_CLR(bhp, BH_WRITE);e   		/*A 		 * If the buffer just written has a larger LSN than the current A 		 * max LSN written for this checkpoint, update the saved value.e 		 */c& 		if (log_compare(&lsn, &mp->lsn) > 0) 			mp->lsn = lsn;e   		--mp->lsn_cnt; 		--mfp->lsn_cnt;f 	}  . 	/* Update the page clean/dirty statistics. */ 	++mp->stat.st_page_clean; 	--mp->stat.st_page_dirty;   	/* Update I/O statistics. */r 	++mp->stat.st_page_out; 	++mfp->stat.st_page_out;r   	return (0);  5 syserr:	__db_err(dbenv, "%s: %s failed for page %lu",t0 	    __memp_fn(dbmfp), fail, (u_long)bhp->pgno);  ; err:	/* Unlock the buffer and reacquire the region lock. */r 	UNLOCKBUFFER(dbmp, bhp);b 	LOCKREGION(dbmp);   	/*w* 	 * Clean up the flags based on a failure. 	 *nE 	 * The page remains dirty but we remove our lock.  If we rewrote thenC 	 * page, it will need processing by the pgin routine before reuse.o 	 */ 	if (callpgin) 		F_SET(bhp, BH_CALLPGIN); 	F_CLR(bhp, BH_LOCKED);r   	return (ret); }    /*  * __memp_pg --   *	Call the pgin/pgout routine.   *:  * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int));  */  intS __memp_pg(dbmfp, bhp, is_pgin) 	DB_MPOOLFILE *dbmfp;(	 	BH *bhp;R
 	int is_pgin;- {t 	DBT dbt, *dbtp; 	DB_MPOOL *dbmp; 	DB_MPREG *mpreg;b 	MPOOLFILE *mfp; 	int ftype, ret;   	dbmp = dbmfp->dbmp; 	mfp = dbmfp->mfp;    	LOCKHANDLE(dbmp, dbmp->mutexp);   	ftype = mfp->ftype;( 	for (mpreg = LIST_FIRST(&dbmp->dbregq);2 	    mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) { 		if (ftype != mpreg->ftype) 			continue; 		if (mfp->pgcookie_len == 0)_ 			dbtp = NULL;( 		else {  			dbt.size = mfp->pgcookie_len;. 			dbt.data = R_ADDR(dbmp, mfp->pgcookie_off); 			dbtp = &dbt;  		} # 		UNLOCKHANDLE(dbmp, dbmp->mutexp);    		if (is_pgin) {$ 			if (mpreg->pgin != NULL && (ret =4 			    mpreg->pgin(bhp->pgno, bhp->buf, dbtp)) != 0)
 				goto err;d 		} else% 			if (mpreg->pgout != NULL && (ret = 5 			    mpreg->pgout(bhp->pgno, bhp->buf, dbtp)) != 0)h
 				goto err;G 		break; 	}   	if (mpreg == NULL)e# 		UNLOCKHANDLE(dbmp, dbmp->mutexp);d   	return (0);  & err:	UNLOCKHANDLE(dbmp, dbmp->mutexp);4 	__db_err(dbmp->dbenv, "%s: %s failed for page %lu",F 	    __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno); 	return (ret); }    /*  * __memp_bhfree --t0  *	Free a bucket header and its referenced data.  *H  * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, MPOOLFILE *, BH *, int));  */  void' __memp_bhfree(dbmp, mfp, bhp, free_mem)S 	DB_MPOOL *dbmp; 	MPOOLFILE *mfp;	 	BH *bhp;; 	int free_mem; {= 	size_t off;  ; 	/* Delete the buffer header from the hash bucket queue. */ 8 	off = BUCKET(dbmp->mp, R_OFFSET(dbmp, mfp), bhp->pgno);2 	SH_TAILQ_REMOVE(&dbmp->htab[off], bhp, hq, __bh);  3 	/* Delete the buffer header from the LRU queue. */ / 	SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh);	   	/* ? 	 * If we're not reusing it immediately, free the buffer headert 	 * and data for real. 	 */ 	if (free_mem) {% 		__db_shalloc_free(dbmp->addr, bhp);	! 		--dbmp->mp->stat.st_page_clean;d 	} }e   /*  * __memp_upgrade --8  *	Upgrade a file descriptor from readonly to readwrite.  */t
 static int  __memp_upgrade(dbmp, dbmfp, mfp) 	DB_MPOOL *dbmp; 	DB_MPOOLFILE *dbmfp;e 	MPOOLFILE *mfp; {-
 	int fd, ret; 
 	char *rpath;p   	/*_ 	 * !!!f. 	 * We expect the handle to already be locked. 	 */  . 	/* Check to see if we've already upgraded. */  	if (F_ISSET(dbmfp, MP_UPGRADE))
 		return (0);e  , 	/* Check to see if we've already failed. */% 	if (F_ISSET(dbmfp, MP_UPGRADE_FAIL)) 
 		return (1);p   	/*?H 	 * Calculate the real name for this file and try to open it read/write.G 	 * We know we have a valid pathname for the file because it's the only/; 	 * way we could have gotten a file descriptor of any kind.h 	 */2 	if ((ret = __db_appname(dbmp->dbenv, DB_APP_DATA,? 	    NULL, R_ADDR(dbmp, mfp->path_off), 0, NULL, &rpath)) != 0)r 		return (ret);L+ 	if (__db_open(rpath, 0, 0, 0, &fd) != 0) {i  		F_SET(dbmfp, MP_UPGRADE_FAIL);
 		ret = 1;	 	} else {a6 		/* Swap the descriptors and set the upgrade flag. */ 		(void)__os_close(dbmfp->fd); 		dbmfp->fd = fd;i 		F_SET(dbmfp, MP_UPGRADE);;
 		ret = 0; 	} 	__os_freestr(rpath);i 	return (ret); }. *H  * PUBLIC: int __memp_pgwrite __P((DB_MPOOLFILE *, BH *, int *, int *));  */  int , __memp_pgwrite(dbmfp, bhp, restartp, wrotep) 	DB_MPOOLFILE *dbmfp; 	 	BH *bhp;  	int *restartp, *wrotep; {  	DB_ENV *dbenv; 
 	DB_IO db_io;  	DB_LOG *lg_info;  	DB_LSN lsn; 	DB_MPOOL *dbmp; 	MPOOL *mp;  	MPOOLFI                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                