Ensure I/O buffers in libufs(3) are 128-byte aligned.

Various disk controllers require their buffers to be aligned to a
cache-line size (128 bytes). For buffers allocated in structures,
ensure that they are 128-byte aligned. Use aligned_alloc(3) to allocate
memory to ensure that the returned memory is 128-byte aligned.

While we are here, we replace the dynamically allocated inode buffer
with a buffer allocated in the uufsd structure just as the superblock
and cylinder group buffers do.

This can be removed if/when the kernel is fixed. Because this problem
has existed on one I/O subsystem or another since the 1990's, we
are probably stuck with dealing with it forever.

The problem most recently showed up in Azure; see:
    https://reviews.freebsd.org/D41728
    https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=267654
Before these fixes were applied, it was confirmed that the changes
in this commit also fixed the issue in Azure.

Reviewed-by: Warner Losh, kib
Tested-by:   Souradeep Chakrabarti of Microsoft (earlier version)
PR:          267654
Differential Revision: https://reviews.freebsd.org/D41724
This commit is contained in:
Kirk McKusick 2023-11-17 14:10:29 -08:00
parent 415c1c748d
commit 772430dd67
14 changed files with 99 additions and 92 deletions

View File

@ -2,7 +2,7 @@
PACKAGE= ufs
LIB= ufs
SHLIBDIR?= /lib
SHLIB_MAJOR= 7
SHLIB_MAJOR= 8
SRCS= block.c cgroup.c gsb_crc32.c inode.c sblock.c type.c ffs_subr.c
SRCS+= ffs_tables.c

View File

@ -57,19 +57,10 @@ bread(struct uufsd *disk, ufs2_daddr_t blockno, void *data, size_t size)
ERROR(disk, NULL);
p2 = data;
/*
* XXX: various disk controllers require alignment of our buffer
* XXX: which is stricter than struct alignment.
* XXX: Bounce the buffer if not 64 byte aligned.
* XXX: this can be removed if/when the kernel is fixed
*/
if (((intptr_t)data) & 0x3f) {
p2 = malloc(size);
if (p2 == NULL) {
ERROR(disk, "allocate bounce buffer");
goto fail;
}
BUF_MALLOC(&p2, data, size);
if (p2 == NULL) {
ERROR(disk, "allocate bounce buffer");
goto fail;
}
cnt = pread(disk->d_fd, p2, size, (off_t)(blockno * disk->d_bsize));
if (cnt == -1) {
@ -101,7 +92,7 @@ bwrite(struct uufsd *disk, ufs2_daddr_t blockno, const void *data, size_t size)
{
ssize_t cnt;
int rv;
void *p2 = NULL;
void *p2;
ERROR(disk, NULL);
@ -110,24 +101,15 @@ bwrite(struct uufsd *disk, ufs2_daddr_t blockno, const void *data, size_t size)
ERROR(disk, "failed to open disk for writing");
return (-1);
}
/*
* XXX: various disk controllers require alignment of our buffer
* XXX: which is stricter than struct alignment.
* XXX: Bounce the buffer if not 64 byte aligned.
* XXX: this can be removed if/when the kernel is fixed
*/
if (((intptr_t)data) & 0x3f) {
p2 = malloc(size);
if (p2 == NULL) {
ERROR(disk, "allocate bounce buffer");
return (-1);
}
memcpy(p2, data, size);
data = p2;
BUF_MALLOC(&p2, data, size);
if (p2 == NULL) {
ERROR(disk, "allocate bounce buffer");
return (-1);
}
cnt = pwrite(disk->d_fd, data, size, (off_t)(blockno * disk->d_bsize));
if (p2 != NULL)
if (p2 != data)
memcpy(p2, data, size);
cnt = pwrite(disk->d_fd, p2, size, (off_t)(blockno * disk->d_bsize));
if (p2 != data)
free(p2);
if (cnt == -1) {
ERROR(disk, "write error to block device");
@ -137,7 +119,6 @@ bwrite(struct uufsd *disk, ufs2_daddr_t blockno, const void *data, size_t size)
ERROR(disk, "short write to block device");
return (-1);
}
return (cnt);
}

View File

@ -62,18 +62,10 @@ getinode(struct uufsd *disk, union dinodep *dp, ino_t inum)
ERROR(disk, "inode number out of range");
return (-1);
}
inoblock = disk->d_inoblock;
inoblock = (caddr_t)&disk->d_inos[0];
min = disk->d_inomin;
max = disk->d_inomax;
if (inoblock == NULL) {
inoblock = malloc(fs->fs_bsize);
if (inoblock == NULL) {
ERROR(disk, "unable to allocate inode block");
return (-1);
}
disk->d_inoblock = inoblock;
}
if (inum >= min && inum < max)
goto gotit;
bread(disk, fsbtodb(fs, ino_to_fsba(fs, inum)), inoblock,
@ -107,14 +99,10 @@ putinode(struct uufsd *disk)
struct fs *fs;
fs = &disk->d_fs;
if (disk->d_inoblock == NULL) {
ERROR(disk, "No inode block allocated");
return (-1);
}
if (disk->d_ufs == 2)
ffs_update_dinode_ckhash(fs, disk->d_dp.dp2);
if (bwrite(disk, fsbtodb(fs, ino_to_fsba(&disk->d_fs, disk->d_inomin)),
disk->d_inoblock, disk->d_fs.fs_bsize) <= 0)
(caddr_t)&disk->d_inos[0], disk->d_fs.fs_bsize) <= 0)
return (-1);
return (0);
}

View File

@ -30,6 +30,13 @@
#ifndef __LIBUFS_H__
#define __LIBUFS_H__
/*
* Various disk controllers require their buffers to be aligned to the size
* of a cache line. LIBUFS_BUFALIGN defines the required alignment in bytes.
* The alignment must be a power of 2.
*/
#define LIBUFS_BUFALIGN 128
/*
* libufs structures.
*/
@ -42,39 +49,51 @@ union dinodep {
* userland ufs disk.
*/
struct uufsd {
const char *d_name; /* disk name */
int d_ufs; /* decimal UFS version */
int d_fd; /* raw device file descriptor */
long d_bsize; /* device bsize */
ufs2_daddr_t d_sblock; /* superblock location */
struct fs_summary_info *d_si; /* Superblock summary info */
caddr_t d_inoblock; /* inode block */
uint32_t d_inomin; /* low ino, not ino_t for ABI compat */
uint32_t d_inomax; /* high ino, not ino_t for ABI compat */
union dinodep d_dp; /* pointer to currently active inode */
union {
struct fs d_fs; /* filesystem information */
char d_sb[MAXBSIZE]; /* superblock as buffer */
} d_sbunion;
char d_sb[SBLOCKSIZE]; /* superblock as buffer */
} d_sbunion __aligned(LIBUFS_BUFALIGN);
union {
struct cg d_cg; /* cylinder group */
char d_buf[MAXBSIZE]; /* cylinder group storage */
} d_cgunion;
int d_ccg; /* current cylinder group */
int d_lcg; /* last cylinder group (in d_cg) */
} d_cgunion __aligned(LIBUFS_BUFALIGN);
union {
union dinodep d_ino[1]; /* inode block */
char d_inos[MAXBSIZE]; /* inode block as buffer */
} d_inosunion __aligned(LIBUFS_BUFALIGN);
const char *d_name; /* disk name */
const char *d_error; /* human readable disk error */
ufs2_daddr_t d_sblock; /* superblock location */
struct fs_summary_info *d_si; /* Superblock summary info */
union dinodep d_dp; /* pointer to currently active inode */
ino_t d_inomin; /* low ino */
ino_t d_inomax; /* high ino */
off_t d_sblockloc; /* where to look for the superblock */
int d_lookupflags; /* flags to superblock lookup */
int d_mine; /* internal flags */
#define d_fs d_sbunion.d_fs
#define d_sb d_sbunion.d_sb
#define d_cg d_cgunion.d_cg
int64_t d_bsize; /* device bsize */
int64_t d_lookupflags; /* flags to superblock lookup */
int64_t d_mine; /* internal flags */
int32_t d_ccg; /* current cylinder group */
int32_t d_ufs; /* decimal UFS version */
int32_t d_fd; /* raw device file descriptor */
int32_t d_lcg; /* last cylinder group (in d_cg) */
};
#define d_inos d_inosunion.d_inos
#define d_fs d_sbunion.d_fs
#define d_cg d_cgunion.d_cg
/*
* libufs macros (internal, non-exported).
*/
#ifdef _LIBUFS
/*
 * Ensure that the buffer is aligned to the I/O subsystem requirements.
 *
 * If data is non-NULL and already aligned to LIBUFS_BUFALIGN, *newbufpp
 * is set to data itself and nothing is allocated.  Otherwise *newbufpp
 * is set to a new LIBUFS_BUFALIGN-aligned buffer obtained from
 * aligned_alloc(), or to NULL if that allocation fails.  The caller owns
 * (and must free) the result only when it differs from data.
 *
 * The body is wrapped in do/while (0) so that "BUF_MALLOC(...);" is a
 * single statement and is safe in an unbraced if/else.  The data
 * argument is evaluated more than once; do not pass an expression with
 * side effects.
 *
 * NOTE(review): aligned_alloc() formally expects size to be a multiple
 * of the alignment; confirm that all callers pass such sizes.
 */
#define BUF_MALLOC(newbufpp, data, size) do {				\
	if ((data) != NULL &&						\
	    (((intptr_t)(data)) & (LIBUFS_BUFALIGN - 1)) == 0)		\
		*(newbufpp) = (void *)(data);				\
	else								\
		*(newbufpp) = aligned_alloc(LIBUFS_BUFALIGN, (size));	\
} while (0)
/*
* Trace steps through libufs, to be used at entry and erroneous return.
*/

View File

@ -228,7 +228,8 @@ use_pread(void *devfd, off_t loc, void **bufp, int size)
int fd;
fd = *(int *)devfd;
if ((*bufp = malloc(size)) == NULL)
BUF_MALLOC(bufp, NULL, size);
if (*bufp == NULL)
return (ENOSPC);
if (pread(fd, *bufp, size, loc) != size)
return (EIO);

View File

@ -61,10 +61,6 @@ ufs_disk_close(struct uufsd *disk)
ERROR(disk, NULL);
close(disk->d_fd);
disk->d_fd = -1;
if (disk->d_inoblock != NULL) {
free(disk->d_inoblock);
disk->d_inoblock = NULL;
}
if (disk->d_mine & MINE_NAME) {
free((char *)(uintptr_t)disk->d_name);
disk->d_name = NULL;
@ -155,10 +151,16 @@ again: if ((ret = stat(name, &st)) < 0) {
return (-1);
}
if (((uintptr_t)disk & ~(LIBUFS_BUFALIGN - 1)) != (uintptr_t)disk) {
ERROR(disk, "uufsd structure must be aligned to "
"LIBUFS_BUFALIGN byte boundry, see ufs_disk_fillout(3)");
close(fd);
return (-1);
}
disk->d_bsize = 1;
disk->d_ccg = 0;
disk->d_fd = fd;
disk->d_inoblock = NULL;
disk->d_inomin = 0;
disk->d_inomax = 0;
disk->d_lcg = 0;

View File

@ -9,7 +9,7 @@
.\"
.\" This file is in the public domain.
.\"
.Dd June 4, 2003
.Dd November 17, 2023
.Dt UFS_DISK_CLOSE 3
.Os
.Sh NAME
@ -51,6 +51,13 @@ functions open a disk specified by
.Fa name
and populate the structure pointed to by
.Fa disk .
The structure referenced by the
.Fa disk
pointer must be aligned to at least the alignment specified by
.Dv LIBUFS_BUFALIGN
that is defined in the
.In libufs.h
header file.
The disk is opened read-only.
The specified
.Fa name

View File

@ -67,6 +67,7 @@
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <libufs.h>
#include <sys/queue.h>
@ -424,6 +425,20 @@ Malloc(size_t size)
break;
return (retval);
}
/*
 * Allocate an I/O buffer of the given size whose address is aligned to
 * LIBUFS_BUFALIGN, as required by the I/O subsystem.
 *
 * When aligned_alloc() fails, flushentry() is called and the allocation
 * is retried; the loop gives up and returns NULL once flushentry()
 * returns 0.  Unlike calloc(), the returned memory is not zero-filled,
 * so callers that need a zeroed buffer must clear it themselves.
 */
static inline void*
Balloc(size_t size)
{
	void *buf;

	for (;;) {
		buf = aligned_alloc(LIBUFS_BUFALIGN, size);
		if (buf != NULL)
			return (buf);
		if (flushentry() == 0)
			return (NULL);
	}
}
/*
* Wrapper for calloc() that flushes the cylinder group cache to try

View File

@ -58,7 +58,6 @@ static const char sccsid[] = "@(#)utilities.c 8.6 (Berkeley) 5/19/95";
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <libufs.h>
#include "fsck.h"
@ -189,7 +188,7 @@ bufinit(void)
initbarea(&failedbuf, BT_UNKNOWN);
failedbuf.b_errs = -1;
failedbuf.b_un.b_buf = NULL;
if ((cgblk.b_un.b_buf = Malloc((unsigned int)sblock.fs_bsize)) == NULL)
if ((cgblk.b_un.b_buf = Balloc((unsigned int)sblock.fs_bsize)) == NULL)
errx(EEXIT, "Initial malloc(%d) failed", sblock.fs_bsize);
initbarea(&cgblk, BT_CYLGRP);
numbufs = cachelookups = cachereads = 0;
@ -211,7 +210,7 @@ allocbuf(const char *failreason)
char *bufp;
bp = (struct bufarea *)Malloc(sizeof(struct bufarea));
bufp = Malloc((unsigned int)sblock.fs_bsize);
bufp = Balloc((unsigned int)sblock.fs_bsize);
if (bp == NULL || bufp == NULL) {
errx(EEXIT, "%s", failreason);
/* NOTREACHED */
@ -241,7 +240,7 @@ cglookup(int cg)
if ((unsigned) cg >= sblock.fs_ncg)
errx(EEXIT, "cglookup: out of range cylinder group %d", cg);
if (cgbufs == NULL) {
cgbufs = calloc(sblock.fs_ncg, sizeof(struct bufarea));
cgbufs = Calloc(sblock.fs_ncg, sizeof(struct bufarea));
if (cgbufs == NULL)
errx(EEXIT, "Cannot allocate cylinder group buffers");
}
@ -250,7 +249,7 @@ cglookup(int cg)
return (cgbp);
cgp = NULL;
if (flushtries == 0)
cgp = Malloc((unsigned int)sblock.fs_cgsize);
cgp = Balloc((unsigned int)sblock.fs_cgsize);
if (cgp == NULL) {
if (sujrecovery)
errx(EEXIT,"Ran out of memory during journal recovery");
@ -966,7 +965,7 @@ blzero(int fd, ufs2_daddr_t blk, long size)
if (fd < 0)
return;
if (zero == NULL) {
zero = calloc(ZEROBUFSIZE, 1);
zero = Balloc(ZEROBUFSIZE);
if (zero == NULL)
errx(EEXIT, "cannot allocate buffer pool");
}

View File

@ -48,7 +48,6 @@ static const char sccsid[] = "@(#)inode.c 8.8 (Berkeley) 4/28/95";
#include <pwd.h>
#include <string.h>
#include <time.h>
#include <libufs.h>
#include "fsck.h"
@ -646,7 +645,7 @@ setinodebuf(int cg, ino_t inosused)
inobufsize = blkroundup(&sblock,
MAX(INOBUFSIZE, sblock.fs_bsize));
initbarea(&inobuf, BT_INODES);
if ((inobuf.b_un.b_buf = Malloc((unsigned)inobufsize)) == NULL)
if ((inobuf.b_un.b_buf = Balloc((unsigned)inobufsize)) == NULL)
errx(EEXIT, "cannot allocate space for inode buffer");
}
fullcnt = inobufsize / ((sblock.fs_magic == FS_UFS1_MAGIC) ?

View File

@ -59,7 +59,6 @@ static char sccsid[] = "@(#)main.c 8.6 (Berkeley) 5/14/95";
#include <fstab.h>
#include <grp.h>
#include <inttypes.h>
#include <libufs.h>
#include <mntopts.h>
#include <paths.h>
#include <stdint.h>

View File

@ -45,7 +45,6 @@ static const char sccsid[] = "@(#)pass5.c 8.9 (Berkeley) 4/28/95";
#include <inttypes.h>
#include <limits.h>
#include <string.h>
#include <libufs.h>
#include "fsck.h"

View File

@ -52,7 +52,6 @@ static const char sccsid[] = "@(#)setup.c 8.10 (Berkeley) 5/9/95";
#include <limits.h>
#include <stdint.h>
#include <string.h>
#include <libufs.h>
#include "fsck.h"
@ -214,7 +213,7 @@ setup(char *dev)
sbdirty();
}
if (snapcnt > 0 && copybuf == NULL) {
copybuf = Malloc(sblock.fs_bsize);
copybuf = Balloc(sblock.fs_bsize);
if (copybuf == NULL)
errx(EEXIT, "cannot allocate space for snapshot "
"copy buffer");
@ -501,7 +500,7 @@ sblock_init(void)
fsmodified = 0;
lfdir = 0;
initbarea(&sblk, BT_SUPERBLK);
sblk.b_un.b_buf = Malloc(SBLOCKSIZE);
sblk.b_un.b_buf = Balloc(SBLOCKSIZE);
if (sblk.b_un.b_buf == NULL)
errx(EEXIT, "cannot allocate space for superblock");
dev_bsize = secsize = DEV_BSIZE;
@ -530,7 +529,7 @@ calcsb(char *dev, int devfd, struct fs *fs)
*/
if (ioctl(devfd, DIOCGSECTORSIZE, &secsize) == -1)
return (0);
fsrbuf = Malloc(secsize);
fsrbuf = Balloc(secsize);
if (fsrbuf == NULL)
errx(EEXIT, "calcsb: cannot allocate recovery buffer");
if (blread(devfd, fsrbuf,
@ -573,7 +572,7 @@ chkrecovery(int devfd)
rdsize = sblock.fs_fsize;
if (ioctl(devfd, DIOCGSECTORSIZE, &secsize) == -1 ||
rdsize % secsize != 0 ||
(fsrbuf = Malloc(rdsize)) == NULL ||
(fsrbuf = Balloc(rdsize)) == NULL ||
blread(devfd, fsrbuf, (SBLOCK_UFS2 - rdsize) / dev_bsize,
rdsize) != 0) {
free(fsrbuf);
@ -612,7 +611,7 @@ saverecovery(int readfd, int writefd)
if (sblock.fs_magic != FS_UFS2_MAGIC ||
ioctl(readfd, DIOCGSECTORSIZE, &secsize) == -1 ||
rdsize % secsize != 0 ||
(fsrbuf = Malloc(rdsize)) == NULL ||
(fsrbuf = Balloc(rdsize)) == NULL ||
blread(readfd, fsrbuf, (SBLOCK_UFS2 - rdsize) / dev_bsize,
rdsize) != 0) {
printf("RECOVERY DATA COULD NOT BE CREATED\n");

View File

@ -47,7 +47,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <libufs.h>
#include <string.h>
#include <strings.h>
#include <sysexits.h>
@ -2274,7 +2273,7 @@ suj_add_block(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
static void
suj_read(void)
{
uint8_t block[1 * 1024 * 1024];
uint8_t block[1 * 1024 * 1024] __aligned(LIBUFS_BUFALIGN);
struct suj_seg *seg;
struct jsegrec *recn;
struct jsegrec *rec;