[File] Substantial increase in I/O
Mark Hills
mark at xwax.org
Sun Jul 20 19:56:07 UTC 2025
After an upgrade, I got caught out by substantially more I/O from the
"file" command.
It is straightforward to see in Linux strace. The default read() call was
changed from 1 MiB to 7 MiB with the patch below.
I'm specifically in a low bandwidth environment which is why this is so
obvious and problematic. But a 7x increase in I/O will also negatively
affect anyone working in bulk.
It seems unnecessary when so few bytes are needed to identify most files
(anecdotally, tested using "-P bytes=xxxx").
I had always assumed the command was reading data on-demand, and the magic
algorithm used some kind of hierarchy of matches.
So I experimented (patch below) with doing that. I swapped the buffer from
read() to mmap(), to let the kernel page the file in on-demand.
However, this experiment seems to bring little benefit: I can watch the
full buffer being paged in even for a simple identification.
I'm out of time for now, so I wanted to share these findings and a
question:
Is scanning the whole buffer fundamental to the 'magic' algorithm design,
or is it resulting from a few 'difficult' magic rules?
An ugly way would be to re-run the whole algorithm with increasing bytes,
until a match is found. But I suspect a better solution exists.
Thanks
--
Mark
---
src/magic.c | 26 +++++++++++++++-----------
1 file changed, 15 insertions(+), 11 deletions(-)
diff --git a/src/magic.c b/src/magic.c
index 89a3606c..20cae8f8 100644
--- a/src/magic.c
+++ b/src/magic.c
@@ -436,8 +436,6 @@ file_or_fd(struct magic_set *ms, const char *inname, int fd)
* some overlapping space for matches near EOF
*/
#define SLOP (1 + sizeof(union VALUETYPE))
- if ((buf = CAST(unsigned char *, malloc(ms->bytes_max + SLOP))) == NULL)
- return NULL;
switch (file_fsmagic(ms, inname, &sb)) {
case -1: /* error */
@@ -519,22 +517,28 @@ file_or_fd(struct magic_set *ms, const char *inname, int fd)
_isatty(fd) ? 8 * 1024 :
#endif
ms->bytes_max;
- if ((nbytes = read(fd, RCAST(void *, buf), howmany)) == -1) {
- if (inname == NULL && fd != STDIN_FILENO)
- file_error(ms, errno, "cannot read fd %d", fd);
- else
- file_error(ms, errno, "cannot read `%s'",
- inname == NULL ? "/dev/stdin" : inname);
- goto done;
+
+ if (!okstat)
+ abort();
+
+ if ((size_t)sb.st_size < howmany)
+ howmany = (size_t)sb.st_size;
+
+ buf = mmap(NULL, howmany, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+ if (buf == (void*)-1) {
+ perror("mmap");
+ abort();
}
+
+ nbytes = howmany;
}
- (void)memset(buf + nbytes, 0, SLOP); /* NUL terminate */
+ (void)memset(buf + nbytes - SLOP, 0, SLOP); /* NUL terminate */
if (file_buffer(ms, fd, okstat ? &sb : NULL, inname, buf, CAST(size_t, nbytes)) == -1)
goto done;
+
rv = 0;
done:
- free(buf);
if (fd != -1) {
if (pos != CAST(off_t, -1))
(void)lseek(fd, pos, SEEK_SET);
--
2.47.2
From d2659ae455c7df7e8c6355dfaaea1180236a2932 Mon Sep 17 00:00:00 2001
From: Christos Zoulas <christos at zoulas.com>
Date: Sun, 2 Oct 2022 12:53:28 +0000
Subject: [PATCH] Increase the number of bytes we are looking for (Joerg
Jenderek)
---
src/file.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/file.h b/src/file.h
index 359a12a9..8281426f 100644
--- a/src/file.h
+++ b/src/file.h
@@ -27,7 +27,7 @@
*/
/*
* file.h - definitions for file(1) program
- * @(#)$File: file.h,v 1.239 2022/09/24 20:30:13 christos Exp $
+ * @(#)$File: file.h,v 1.240 2022/10/02 12:53:28 christos Exp $
*/
#ifndef __file_h__
@@ -483,8 +483,8 @@ struct magic_set {
size_t bytes_max; /* number of bytes to read from file */
size_t encoding_max; /* bytes to look for encoding */
#ifndef FILE_BYTES_MAX
-# define FILE_BYTES_MAX (1024 * 1024) /* how much of the file to look at */
-#endif
+# define FILE_BYTES_MAX (7 * 1024 * 1024)/* how much of the file to look at */
+#endif /* above 0x6ab0f4 map offset for HelveticaNeue.dfont */
#define FILE_ELF_NOTES_MAX 256
#define FILE_ELF_PHNUM_MAX 2048
#define FILE_ELF_SHNUM_MAX 32768
--
2.47.2
More information about the File
mailing list