mirror of
https://github.com/freebsd/freebsd-src
synced 2024-10-15 21:05:08 +00:00
This patch implements O_DIRECT about 80% of the way. It takes a patchset
Tor created a while ago, removes the raw I/O piece (that has cache coherency problems), and adds a buffer cache / VM freeing piece. Essentially this patch causes O_DIRECT I/O to not be left in the cache, but does not prevent it from going through the cache, hence the 80%. For the last 20% we need a method by which the I/O can be issued directly to a buffer supplied by the user process and bypass the buffer cache entirely, but still maintain cache coherency. I also have the code working under -stable, but the changes made to sys/file.h may not be MFCable, so an MFC is not on the table yet. Submitted by: tegge, dillon
This commit is contained in:
parent
e8f64f5ebf
commit
ac8f990bde
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=77115
|
@ -147,6 +147,11 @@ corresponds to the
|
|||
.Dv O_APPEND
|
||||
flag of
|
||||
.Xr open 2 .
|
||||
.It Dv O_DIRECT
|
||||
Minimize or eliminate the cache effects of reading and writing. The system
|
||||
will attempt to avoid caching the data you read or write. If it cannot
|
||||
avoid caching the data, it will minimize the impact the data has on the cache.
|
||||
Use of this flag can drastically reduce performance if not used with care.
|
||||
.It Dv O_ASYNC
|
||||
Enable the
|
||||
.Dv SIGIO
|
||||
|
|
|
@ -83,6 +83,7 @@ O_TRUNC truncate size to 0
|
|||
O_EXCL error if create and file exists
|
||||
O_SHLOCK atomically obtain a shared lock
|
||||
O_EXLOCK atomically obtain an exclusive lock
|
||||
O_DIRECT eliminate or reduce cache effects
|
||||
O_FSYNC synchronous writes
|
||||
O_NOFOLLOW do not follow symlinks
|
||||
.Ed
|
||||
|
@ -150,6 +151,12 @@ If creating a file with
|
|||
the request for the lock will never fail
|
||||
(provided that the underlying filesystem supports locking).
|
||||
.Pp
|
||||
.Dv O_DIRECT
may be used to
|
||||
minimize or eliminate the cache effects of reading and writing. The system
|
||||
will attempt to avoid caching the data you read or write. If it cannot
|
||||
avoid caching the data, it will minimize the impact the data has on the cache.
|
||||
Use of this flag can drastically reduce performance if not used with care.
|
||||
.Pp
|
||||
If successful,
|
||||
.Fn open
|
||||
returns a non-negative integer, termed a file descriptor.
|
||||
|
|
|
@ -1249,7 +1249,7 @@ brelse(struct buf * bp)
|
|||
|
||||
/* unlock */
|
||||
BUF_UNLOCK(bp);
|
||||
bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
|
||||
bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
|
||||
bp->b_ioflags &= ~BIO_ORDERED;
|
||||
if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
|
||||
panic("brelse: not dirty");
|
||||
|
@ -1264,6 +1264,8 @@ brelse(struct buf * bp)
|
|||
* biodone() to requeue an async I/O on completion. It is also used when
|
||||
* known good buffers need to be requeued but we think we may need the data
|
||||
* again soon.
|
||||
*
|
||||
* XXX we should be able to leave the B_RELBUF hint set on completion.
|
||||
*/
|
||||
void
|
||||
bqrelse(struct buf * bp)
|
||||
|
@ -1355,12 +1357,15 @@ vfs_vmio_release(bp)
|
|||
vm_page_flag_clear(m, PG_ZERO);
|
||||
/*
|
||||
* Might as well free the page if we can and it has
|
||||
* no valid data.
|
||||
* no valid data. We also free the page if the
|
||||
* buffer was used for direct I/O
|
||||
*/
|
||||
if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
|
||||
vm_page_busy(m);
|
||||
vm_page_protect(m, VM_PROT_NONE);
|
||||
vm_page_free(m);
|
||||
} else if (bp->b_flags & B_DIRECT) {
|
||||
vm_page_try_to_free(m);
|
||||
} else if (vm_page_count_severe()) {
|
||||
vm_page_try_to_cache(m);
|
||||
}
|
||||
|
|
|
@ -505,6 +505,15 @@ cluster_callback(bp)
|
|||
tbp->b_dirtyoff = tbp->b_dirtyend = 0;
|
||||
tbp->b_flags &= ~B_INVAL;
|
||||
tbp->b_ioflags &= ~BIO_ERROR;
|
||||
/*
|
||||
* XXX the bdwrite()/bqrelse() issued during
|
||||
* cluster building clears B_RELBUF (see bqrelse()
|
||||
* comment). If direct I/O was specified, we have
|
||||
* to restore it here to allow the buffer and VM
|
||||
* to be freed.
|
||||
*/
|
||||
if (tbp->b_flags & B_DIRECT)
|
||||
tbp->b_flags |= B_RELBUF;
|
||||
}
|
||||
bufdone(tbp);
|
||||
}
|
||||
|
|
|
@ -352,6 +352,8 @@ vn_read(fp, uio, cred, flags, p)
|
|||
ioflag = 0;
|
||||
if (fp->f_flag & FNONBLOCK)
|
||||
ioflag |= IO_NDELAY;
|
||||
if (fp->f_flag & O_DIRECT)
|
||||
ioflag |= IO_DIRECT;
|
||||
VOP_LEASE(vp, p, cred, LEASE_READ);
|
||||
vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
|
||||
if ((flags & FOF_OFFSET) == 0)
|
||||
|
@ -393,6 +395,8 @@ vn_write(fp, uio, cred, flags, p)
|
|||
ioflag |= IO_APPEND;
|
||||
if (fp->f_flag & FNONBLOCK)
|
||||
ioflag |= IO_NDELAY;
|
||||
if (fp->f_flag & O_DIRECT)
|
||||
ioflag |= IO_DIRECT;
|
||||
if ((fp->f_flag & O_FSYNC) ||
|
||||
(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
|
||||
ioflag |= IO_SYNC;
|
||||
|
|
|
@ -187,13 +187,17 @@ struct buf {
|
|||
* The buffer's data is always PAGE_SIZE aligned even
|
||||
* if b_bufsize and b_bcount are not. ( b_bufsize is
|
||||
* always at least DEV_BSIZE aligned, though ).
|
||||
*
|
||||
*
|
||||
* B_DIRECT Hint that we should attempt to completely free
|
||||
* the pages underlying the buffer. B_DIRECT is
|
||||
* sticky until the buffer is released and typically
|
||||
* only has an effect when B_RELBUF is also set.
|
||||
*/
|
||||
|
||||
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
|
||||
#define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */
|
||||
#define B_ASYNC 0x00000004 /* Start I/O, do not wait. */
|
||||
#define B_UNUSED0 0x00000008 /* Old B_BAD */
|
||||
#define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */
|
||||
#define B_DEFERRED 0x00000010 /* Skipped over for cleaning */
|
||||
#define B_CACHE 0x00000020 /* Bread found us in the cache. */
|
||||
#define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */
|
||||
|
@ -225,7 +229,7 @@ struct buf {
|
|||
"\33paging\32xxx\31writeinprog\30want\27relbuf\26dirty" \
|
||||
"\25read\24raw\23phys\22clusterok\21malloc\20nocache" \
|
||||
"\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \
|
||||
"\10delwri\7call\6cache\4bad\3async\2needcommit\1age"
|
||||
"\10delwri\7call\6cache\4direct\3async\2needcommit\1age"
|
||||
|
||||
/*
|
||||
* These flags are kept in b_xflags.
|
||||
|
|
|
@ -98,15 +98,18 @@
|
|||
/* Defined by POSIX 1003.1; BSD default, but must be distinct from O_RDONLY. */
|
||||
#define O_NOCTTY 0x8000 /* don't assign controlling terminal */
|
||||
|
||||
/* Attempt to bypass buffer cache */
|
||||
#define O_DIRECT 0x00010000
|
||||
|
||||
#ifdef _KERNEL
|
||||
/* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */
|
||||
#define FFLAGS(oflags) ((oflags) + 1)
|
||||
#define OFLAGS(fflags) ((fflags) - 1)
|
||||
|
||||
/* bits to save after open */
|
||||
#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK)
|
||||
#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT)
|
||||
/* bits settable by fcntl(F_SETFL, ...) */
|
||||
#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM)
|
||||
#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT)
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
|
|
@ -56,7 +56,7 @@ struct knote;
|
|||
*/
|
||||
struct file {
|
||||
LIST_ENTRY(file) f_list;/* list of active files */
|
||||
short f_flag; /* see fcntl.h */
|
||||
short f_FILLER3; /* (old f_flag) */
|
||||
#define DTYPE_VNODE 1 /* file */
|
||||
#define DTYPE_SOCKET 2 /* communications endpoint */
|
||||
#define DTYPE_PIPE 3 /* pipe */
|
||||
|
@ -93,6 +93,7 @@ struct file {
|
|||
*/
|
||||
off_t f_offset;
|
||||
caddr_t f_data; /* vnode or socket */
|
||||
u_int f_flag; /* see fcntl.h */
|
||||
};
|
||||
|
||||
#ifdef MALLOC_DECLARE
|
||||
|
|
|
@ -220,6 +220,7 @@ struct vattr {
|
|||
#define IO_VMIO 0x20 /* data already in VMIO space */
|
||||
#define IO_INVAL 0x40 /* invalidate after I/O */
|
||||
#define IO_ASYNC 0x80 /* bawrite rather than bdwrite */
|
||||
#define IO_DIRECT 0x100 /* attempt to bypass buffer cache */
|
||||
|
||||
/*
|
||||
* Modes. Some values same as Ixxx entries from inode.h for now.
|
||||
|
|
|
@ -286,6 +286,15 @@ READ(ap)
|
|||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* If IO_DIRECT then set B_DIRECT for the buffer. This
|
||||
* will cause us to attempt to release the buffer later on
|
||||
* and will cause the buffer cache to attempt to free the
|
||||
* underlying pages.
|
||||
*/
|
||||
if (ioflag & IO_DIRECT)
|
||||
bp->b_flags |= B_DIRECT;
|
||||
|
||||
/*
|
||||
* We should only get non-zero b_resid when an I/O error
|
||||
* has occurred, which should cause us to break above.
|
||||
|
@ -328,12 +337,12 @@ READ(ap)
|
|||
if (error)
|
||||
break;
|
||||
|
||||
if ((ioflag & IO_VMIO) &&
|
||||
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
|
||||
(LIST_FIRST(&bp->b_dep) == NULL)) {
|
||||
/*
|
||||
* If there are no dependencies, and
|
||||
* it's VMIO, then we don't need the buf,
|
||||
* mark it available for freeing. The VM has the data.
|
||||
* If there are no dependencies, and it's VMIO or direct I/O,
|
||||
* then we don't need the buf, mark it available
|
||||
* for freeing. The VM has the data.
|
||||
*/
|
||||
bp->b_flags |= B_RELBUF;
|
||||
brelse(bp);
|
||||
|
@ -355,7 +364,7 @@ READ(ap)
|
|||
* so it must have come from a 'break' statement
|
||||
*/
|
||||
if (bp != NULL) {
|
||||
if ((ioflag & IO_VMIO) &&
|
||||
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
|
||||
(LIST_FIRST(&bp->b_dep) == NULL)) {
|
||||
bp->b_flags |= B_RELBUF;
|
||||
brelse(bp);
|
||||
|
@ -514,6 +523,8 @@ WRITE(ap)
|
|||
ap->a_cred, flags, &bp);
|
||||
if (error != 0)
|
||||
break;
|
||||
if (ioflag & IO_DIRECT)
|
||||
bp->b_flags |= B_DIRECT;
|
||||
|
||||
if (uio->uio_offset + xfersize > ip->i_size) {
|
||||
ip->i_size = uio->uio_offset + xfersize;
|
||||
|
@ -526,10 +537,18 @@ WRITE(ap)
|
|||
|
||||
error =
|
||||
uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
|
||||
if ((ioflag & IO_VMIO) &&
|
||||
(LIST_FIRST(&bp->b_dep) == NULL))
|
||||
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
|
||||
(LIST_FIRST(&bp->b_dep) == NULL)) {
|
||||
bp->b_flags |= B_RELBUF;
|
||||
}
|
||||
|
||||
/*
|
||||
* If IO_SYNC each buffer is written synchronously. Otherwise
|
||||
* if we have a severe page deficiency write the buffer
|
||||
* asynchronously. Otherwise try to cluster, and if that
|
||||
* doesn't do it then either do an async write (if O_DIRECT),
|
||||
* or a delayed write (if not).
|
||||
*/
|
||||
if (ioflag & IO_SYNC) {
|
||||
(void)bwrite(bp);
|
||||
} else if (vm_page_count_severe() ||
|
||||
|
@ -544,6 +563,9 @@ WRITE(ap)
|
|||
} else {
|
||||
bawrite(bp);
|
||||
}
|
||||
} else if (ioflag & IO_DIRECT) {
|
||||
bp->b_flags |= B_CLUSTEROK;
|
||||
bawrite(bp);
|
||||
} else {
|
||||
bp->b_flags |= B_CLUSTEROK;
|
||||
bdwrite(bp);
|
||||
|
|
|
@ -1303,6 +1303,29 @@ vm_page_try_to_cache(vm_page_t m)
|
|||
return(1);
|
||||
}
|
||||
|
||||
/*
|
||||
* vm_page_try_to_free()
|
||||
*
|
||||
* Attempt to free the page. If we cannot free it, we do nothing.
|
||||
* 1 is returned on success, 0 on failure.
|
||||
*/
|
||||
int
|
||||
vm_page_try_to_free(m)
|
||||
vm_page_t m;
|
||||
{
|
||||
if (m->dirty || m->hold_count || m->busy || m->wire_count ||
|
||||
(m->flags & (PG_BUSY|PG_UNMANAGED))) {
|
||||
return(0);
|
||||
}
|
||||
vm_page_test_dirty(m);
|
||||
if (m->dirty)
|
||||
return(0);
|
||||
vm_page_busy(m);
|
||||
vm_page_protect(m, VM_PROT_NONE);
|
||||
vm_page_free(m);
|
||||
return(1);
|
||||
}
|
||||
|
||||
/*
|
||||
* vm_page_cache
|
||||
*
|
||||
|
|
|
@ -421,6 +421,7 @@ vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int));
|
|||
vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
|
||||
void vm_page_cache __P((register vm_page_t));
|
||||
int vm_page_try_to_cache __P((vm_page_t));
|
||||
int vm_page_try_to_free __P((vm_page_t));
|
||||
void vm_page_dontneed __P((register vm_page_t));
|
||||
static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
|
||||
static __inline void vm_page_free __P((vm_page_t));
|
||||
|
|
Loading…
Reference in a new issue