LKML Archive on lore.kernel.org
* [RFC] mmaped copy too slow?
@ 2008-01-15 1:45 KOSAKI Motohiro
From: KOSAKI Motohiro @ 2008-01-15 1:45 UTC
To: linux-mm, linux-kernel; +Cc: kosaki.motohiro, Rik van Riel, Andrew Morton
[-- Attachment #1: Type: text/plain, Size: 4479 bytes --]
Hi,

At one point I noticed that large file copy speed differs significantly
depending on the copy method.

I compared the following methods:
- read(2) and write(2)
- mmap(2) x2 and memcpy
- mmap(2) and write(2)

In addition, I checked the effect of fadvise(2) and madvise(2).

Strangely:
- the fastest method is read + write + fadvise.
- the slowest method is mmap + memcpy.

Some famous books (e.g. Advanced Programming in the UNIX Environment
by W. Richard Stevens) say that an mmap copy is about twice as fast as
read/write, but on Linux it is not.

I found that the bottleneck is page reclaim.
For comparison, I changed the page reclaim code a bit and tested again.
Test machine:
CPU: Pentium 4 with HT, 2.8GHz
memory: 512MB
Disk I/O: about 20MB/s transfer rate
(in other words, a 1GB transfer needs about 51s in the ideal case:
1024MB / 20MB/s = 51.2s)
Elapsed time of a 1GB file copy (in seconds):

                   2.6.24-rc6   2.6.24-rc6      ratio
                                + my patch      (smaller is faster)
------------------------------------------------------------
rw_cp                  59.32        58.60        98.79%
rw_fadv_cp             57.96        57.96       100.00%
mm_sync_cp             69.97        61.68        88.15%
mm_sync_madv_cp        69.41        62.54        90.10%
mw_cp                  61.69        63.11       102.30%
mw_madv_cp             61.35        61.31        99.93%
This patch is still premature and ugly, but I think it provides enough
information to discuss page reclaim improvements.

The problem: when almost all pages are mapped and have the PTE accessed
bit set, page reclaim goes through the steps below.

1) pages move from the inactive list to the active list
2) pages move from the active list back to the inactive list
3) pages are finally paged out

This is too roundabout, and unnecessary memory pressure happens.
Comments welcome.
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
mm/vmscan.c | 46 +++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 43 insertions(+), 3 deletions(-)
Index: linux-2.6.24-rc6-cp3/mm/vmscan.c
===================================================================
--- linux-2.6.24-rc6-cp3.orig/mm/vmscan.c 2008-01-13 21:58:03.000000000 +0900
+++ linux-2.6.24-rc6-cp3/mm/vmscan.c 2008-01-13 22:30:27.000000000 +0900
@@ -446,13 +446,18 @@ static unsigned long shrink_page_list(st
 	struct pagevec freed_pvec;
 	int pgactivate = 0;
 	unsigned long nr_reclaimed = 0;
+	unsigned long nr_scanned = 0;
+	LIST_HEAD(l_mapped_pages);
+	unsigned long nr_mapped_page_activate = 0;
+	struct page *page;
+	int reference_checked = 0;

 	cond_resched();

 	pagevec_init(&freed_pvec, 1);
+retry:
 	while (!list_empty(page_list)) {
 		struct address_space *mapping;
-		struct page *page;
 		int may_enter_fs;
 		int referenced;
@@ -466,6 +471,7 @@ static unsigned long shrink_page_list(st
 		VM_BUG_ON(PageActive(page));

+		nr_scanned++;
 		sc->nr_scanned++;

 		if (!sc->may_swap && page_mapped(page))
@@ -493,11 +499,17 @@ static unsigned long shrink_page_list(st
 			goto keep_locked;
 		}

-		referenced = page_referenced(page, 1);
-		/* In active use or really unfreeable? Activate it. */
-		if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
-			referenced && page_mapping_inuse(page))
-			goto activate_locked;
+		if (!reference_checked) {
+			referenced = page_referenced(page, 1);
+			/* In active use or really unfreeable? Activate it. */
+			if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
+			    referenced && page_mapping_inuse(page)) {
+				nr_mapped_page_activate++;
+				unlock_page(page);
+				list_add(&page->lru, &l_mapped_pages);
+				continue;
+			}
+		}

 #ifdef CONFIG_SWAP
 		/*
@@ -604,7 +616,31 @@ keep:
 		list_add(&page->lru, &ret_pages);
 		VM_BUG_ON(PageLRU(page));
 	}
+
+	if (nr_scanned == nr_mapped_page_activate) {
+		/* may be under copy by mmap.
+		   ignore reference flag. */
+		reference_checked = 1;
+		list_splice(&l_mapped_pages, page_list);
+		goto retry;
+	} else {
+		/* move active list just now */
+		while (!list_empty(&l_mapped_pages)) {
+			page = lru_to_page(&l_mapped_pages);
+			list_del(&page->lru);
+			prefetchw_prev_lru_page(page, &l_mapped_pages, flags);
+
+			if (!TestSetPageLocked(page)) {
+				SetPageActive(page);
+				pgactivate++;
+				unlock_page(page);
+			}
+			list_add(&page->lru, &ret_pages);
+		}
+	}
+
 	list_splice(&ret_pages, page_list);
+
 	if (pagevec_count(&freed_pvec))
 		__pagevec_release_nonlru(&freed_pvec);
 	count_vm_events(PGACTIVATE, pgactivate);
[-- Attachment #2: mmap-write.c --]
[-- Type: application/octet-stream, Size: 1066 bytes --]
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <fcntl.h>
#include <sys/stat.h>

int main(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s from_file to_file\n", argv[0]);
		exit(1);
	}

	/* from */
	int from = open(argv[1], O_RDONLY, 0644);
	assert(from >= 0);

	struct stat st_buf;
	assert(fstat(from, &st_buf) >= 0);
	size_t size = st_buf.st_size;

	void *from_mmap = mmap(NULL, size, PROT_READ, MAP_SHARED, from, 0);
	assert(from_mmap != MAP_FAILED);
#if USE_MADVISE
	assert(madvise(from_mmap, size, MADV_SEQUENTIAL) >= 0);
#endif

	/* to */
	int to = open(argv[2], O_CREAT | O_WRONLY, 0666);
	assert(to >= 0);

	/* copy: write(2) directly from the source mapping */
	char *p = from_mmap;
	const char * const endp = (char *)from_mmap + size;
	while (p < endp) {
		ssize_t num_bytes = write(to, p, endp - p);
		assert(num_bytes >= 0);
		p += num_bytes;
	}
	assert(p == endp);

	fsync(to);
	close(to);
	close(from);
	return 0;
}
[-- Attachment #3: read-write.c --]
[-- Type: application/octet-stream, Size: 1225 bytes --]
#define _XOPEN_SOURCE 600	/* for posix_fadvise() and POSIX_FADV_* */

#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#define BUF_SIZE 8192

int main(int argc, char **argv)
{
	char buf[BUF_SIZE];
	struct stat st_buf;

	if (argc < 3) {
		fprintf(stderr, "usage: %s src out\n", argv[0]);
		exit(EXIT_FAILURE);
	}
	char *src = argv[1];
	char *dest = argv[2];
	assert(strcmp(src, dest) != 0);

	int srcfd = open(src, O_RDONLY, 0644);
	assert(srcfd >= 0);
#if USE_FADVISE
	posix_fadvise(srcfd, 0, 0, POSIX_FADV_SEQUENTIAL);
	posix_fadvise(srcfd, 0, 0, POSIX_FADV_NOREUSE);
#endif

	/* copy the source file's permissions */
	assert(fstat(srcfd, &st_buf) >= 0);
	int destfd = open(dest, O_WRONLY | O_CREAT, st_buf.st_mode);
	assert(destfd >= 0);

	ssize_t n;
	while ((n = read(srcfd, buf, sizeof(buf))) > 0) {
		char *p = buf;
		const char * const endp = buf + n;
		while (p < endp) {
			ssize_t num_bytes = write(destfd, p, endp - p);
			assert(num_bytes >= 0);
			p += num_bytes;
		}
	}
	assert(n == 0);	/* read(2) returns 0 at EOF, -1 on error */

	fsync(destfd);
	close(destfd);
	close(srcfd);
	exit(EXIT_SUCCESS);
}
[-- Attachment #4: test.sh --]
[-- Type: application/octet-stream, Size: 568 bytes --]
#!/bin/zsh -x
SRC=testfile1G
DST=testfile1G2
TIMEX=/usr/bin/time
PREPARE='rm $DST;sync;sync;sync;sudo sh -c "echo 3 > /proc/sys/vm/drop_caches";sleep 1'
REPEAT=1
(repeat $REPEAT (eval $PREPARE; $TIMEX ./rw_cp ${SRC} ${DST}))
(repeat $REPEAT (eval $PREPARE; $TIMEX ./rw_fadv_cp ${SRC} ${DST}))
(repeat $REPEAT (eval $PREPARE; $TIMEX ./mm_sync_cp ${SRC} ${DST}))
(repeat $REPEAT (eval $PREPARE; $TIMEX ./mm_sync_madv_cp ${SRC} ${DST}))
(repeat $REPEAT (eval $PREPARE; $TIMEX ./mw_cp ${SRC} ${DST}))
(repeat $REPEAT (eval $PREPARE; $TIMEX ./mw_madv_cp ${SRC} ${DST}))
[-- Attachment #5: Makefile --]
[-- Type: application/octet-stream, Size: 1047 bytes --]
CFLAGS = -Wall -O2 -static
TARGET = rw_cp rw_fadv_cp mm_sync_cp mm_mun_cp mm_sync_madv_cp mm_mun_madv_cp mw_cp mw_madv_cp mm_sync_nocache_cp mm_sync_madv_nocache_cp

all: $(TARGET)

rw_cp: read-write.c
	gcc $(CFLAGS) -o $@ $<

rw_fadv_cp: read-write.c
	gcc $(CFLAGS) -DUSE_FADVISE -o $@ $<

mm_sync_cp: mmap-mmap.c
	gcc $(CFLAGS) -DWITH_MSYNC -o $@ $<

mm_sync_nocache_cp: mmap-mmap.c
	gcc $(CFLAGS) -DWITH_MSYNC -DUSE_NOCACHE_MEMCPY -o $@ $<

mm_mun_cp: mmap-mmap.c
	gcc $(CFLAGS) -DWITH_MUNMAP -o $@ $<

mm_sync_madv_cp: mmap-mmap.c
	gcc $(CFLAGS) -DUSE_MADVISE -DWITH_MSYNC -o $@ $<

mm_sync_madv_nocache_cp: mmap-mmap.c
	gcc $(CFLAGS) -DUSE_MADVISE -DWITH_MSYNC -DUSE_NOCACHE_MEMCPY -o $@ $<

mm_mun_madv_cp: mmap-mmap.c
	gcc $(CFLAGS) -DUSE_MADVISE -DWITH_MUNMAP -o $@ $<

mw_cp: mmap-write.c
	gcc $(CFLAGS) -o $@ $<

mw_madv_cp: mmap-write.c
	gcc $(CFLAGS) -DUSE_MADVISE -o $@ $<

clean:
	-rm -f *.o
	-rm -f $(TARGET)
[-- Attachment #6: mmap-mmap.c --]
[-- Type: application/octet-stream, Size: 1457 bytes --]
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <fcntl.h>
#include <sys/stat.h>

int main(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s from_file to_file\n", argv[0]);
		exit(1);
	}

	/* from */
	int from = open(argv[1], O_RDONLY, 0644);
	assert(from >= 0);

	struct stat st_buf;
	assert(fstat(from, &st_buf) >= 0);
	size_t size = st_buf.st_size;

	void *from_mmap = mmap(NULL, size, PROT_READ, MAP_SHARED, from, 0);
	assert(from_mmap != MAP_FAILED);
#if USE_MADVISE
	assert(madvise(from_mmap, size, MADV_SEQUENTIAL) >= 0);
#endif

	/* to: extend the destination file to the final size first,
	 * so the writable mapping covers the whole file */
	int to = open(argv[2], O_CREAT | O_RDWR, st_buf.st_mode);
	assert(to >= 0);

	int i = 0;
	assert(lseek(to, size - sizeof(int), SEEK_SET) >= 0);
	assert(write(to, &i, sizeof(int)) == sizeof(int));

	void *to_mmap = mmap(NULL, size, PROT_WRITE, MAP_SHARED, to, 0);
	assert(to_mmap != MAP_FAILED);
#if USE_MADVISE
	assert(madvise(to_mmap, size, MADV_SEQUENTIAL) >= 0);
#endif

	/* copy */
	memcpy(to_mmap, from_mmap, size);

#if WITH_MSYNC
	assert(msync(to_mmap, size, MS_SYNC) >= 0);
#endif
#if WITH_MUNMAP
	assert(munmap(to_mmap, size) >= 0);
#endif
	assert(ftruncate(to, size) >= 0);
	return 0;
}
* Re: [RFC] mmaped copy too slow?
From: Rik van Riel @ 2008-01-15 2:15 UTC
To: KOSAKI Motohiro; +Cc: linux-mm, linux-kernel, kosaki.motohiro, Andrew Morton
On Tue, 15 Jan 2008 10:45:47 +0900
KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> wrote:
> The problem: when almost all pages are mapped and have the PTE accessed
> bit set, page reclaim goes through the steps below.
>
> 1) pages move from the inactive list to the active list
> 2) pages move from the active list back to the inactive list
> 3) pages are finally paged out
>
> This is too roundabout, and unnecessary memory pressure happens.
> Comments welcome.
While being able to deal with used-once mappings in page reclaim
could be a good idea, this would require us to be able to determine
the difference between a page that was accessed once since it was
faulted in and a page that got accessed several times.
That kind of infrastructure could end up adding more overhead than
an immediate reclaim of these streaming mmap pages would save.
Given that page faults have overhead too, it does not surprise me
that read+write is faster than mmap+memcpy.
In threaded applications, page fault overhead will be worse still,
since the TLBs need to be synchronized between CPUs (at least at
reclaim time).
Maybe we should just advise people to use read+write, since it is
faster than mmap+memcpy?
--
All rights reversed.
* Re: [RFC] mmaped copy too slow?
From: KOSAKI Motohiro @ 2008-01-15 3:20 UTC
To: Rik van Riel; +Cc: kosaki.motohiro, linux-mm, linux-kernel, Andrew Morton
Hi Rik
> While being able to deal with used-once mappings in page reclaim
> could be a good idea, this would require us to be able to determine
> the difference between a page that was accessed once since it was
> faulted in and a page that got accessed several times.
Maybe it makes sense to treat readahead-hit pages as used-once mappings.
I will try it.
(I can probably repost soon.)
> Given that page faults have overhead too, it does not surprise me
> that read+write is faster than mmap+memcpy.
>
> In threaded applications, page fault overhead will be worse still,
> since the TLBs need to be synchronized between CPUs (at least at
> reclaim time).
Sure.
But the current performance difference is unnecessarily large.
I hope to improve it, because copying via mmap is a very common operation.
> Maybe we should just advise people to use read+write, since it is
> faster than mmap+memcpy?
Time will solve that :)
thanks!
- kosaki
* Re: [RFC] mmaped copy too slow?
From: Peter Zijlstra @ 2008-01-15 8:57 UTC
To: KOSAKI Motohiro; +Cc: Rik van Riel, linux-mm, linux-kernel, Andrew Morton
On Tue, 2008-01-15 at 12:20 +0900, KOSAKI Motohiro wrote:
> Hi Rik
>
> > While being able to deal with used-once mappings in page reclaim
> > could be a good idea, this would require us to be able to determine
> > the difference between a page that was accessed once since it was
> > faulted in and a page that got accessed several times.
>
> Maybe it makes sense to treat readahead-hit pages as used-once mappings.
> I will try it.
I once had a patch that made read-ahead give feedback into page reclaim,
but people didn't like it.
* Re: [RFC] mmaped copy too slow?
From: KOSAKI Motohiro @ 2008-01-15 9:03 UTC
To: Peter Zijlstra
Cc: kosaki.motohiro, Rik van Riel, linux-mm, linux-kernel, Andrew Morton
Hi Peter,
> > > While being able to deal with used-once mappings in page reclaim
> > > could be a good idea, this would require us to be able to determine
> > > the difference between a page that was accessed once since it was
> > > faulted in and a page that got accessed several times.
> >
> > Maybe it makes sense to treat readahead-hit pages as used-once mappings.
> > I will try it.
>
> I once had a patch that made read-ahead give feedback into page reclaim,
> but people didn't like it.
Could you please tell me the mail subject or a URL?
I would like to know why people didn't like it.
thanks
- kosaki
* Re: [RFC] mmaped copy too slow?
From: Peter Zijlstra @ 2008-01-15 9:08 UTC
To: KOSAKI Motohiro; +Cc: Rik van Riel, linux-mm, linux-kernel, Andrew Morton
On Tue, 2008-01-15 at 18:03 +0900, KOSAKI Motohiro wrote:
> Hi Peter,
>
> > > > While being able to deal with used-once mappings in page reclaim
> > > > could be a good idea, this would require us to be able to determine
> > > > the difference between a page that was accessed once since it was
> > > > faulted in and a page that got accessed several times.
> > >
> > > Maybe it makes sense to treat readahead-hit pages as used-once mappings.
> > > I will try it.
> >
> > I once had a patch that made read-ahead give feedback into page reclaim,
> > but people didn't like it.
>
> Could you please tell me your mail subject or URL?
> I hope know why people didn't like.
I think this is the last thread on the subject:
http://lkml.org/lkml/2007/7/21/219
* Re: [RFC] mmaped copy too slow?
From: Paulo Marques @ 2008-01-15 12:46 UTC
To: KOSAKI Motohiro; +Cc: linux-mm, linux-kernel, Rik van Riel, Andrew Morton
KOSAKI Motohiro wrote:
> Hi,
>
> At one point I noticed that large file copy speed differs significantly
> depending on the copy method.
>
> I compared the following methods:
> - read(2) and write(2)
> - mmap(2) x2 and memcpy
> - mmap(2) and write(2)
>
> In addition, I checked the effect of fadvise(2) and madvise(2).
>
> Strangely:
> - the fastest method is read + write + fadvise.
> - the slowest method is mmap + memcpy.
One thing you could also try is to pass MAP_POPULATE to mmap so that the
page tables are filled in at the time of the mmap, avoiding a lot of
page faults later.
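For illustration, a minimal sketch of that change against the attached
mmap-mmap.c (MAP_POPULATE is simply OR-ed into the mmap flags; this is
an untested suggestion, not code from the benchmark above):

	/* hypothetical change: MAP_POPULATE prefaults the whole mapping
	 * at mmap time, so the copy loop takes no read faults later */
	void *from_mmap = mmap(NULL, size, PROT_READ,
			       MAP_SHARED | MAP_POPULATE, from, 0);
	assert(from_mmap != MAP_FAILED);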
Just my 2 cents,
--
Paulo Marques - www.grupopie.com
"All I ask is a chance to prove that money can't make me happy."
* Re: [RFC] mmaped copy too slow?
From: KOSAKI Motohiro @ 2008-01-16 2:05 UTC
To: Paulo Marques
Cc: kosaki.motohiro, linux-mm, linux-kernel, Rik van Riel, Andrew Morton
Hi Paulo
> One thing you could also try is to pass MAP_POPULATE to mmap so that the
> page tables are filled in at the time of the mmap, avoiding a lot of
> page faults later.
>
> Just my 2 cents,
OK, I will test your idea and report back tomorrow.
But I don't think page faults are the major performance impact.
More likely, these two things cost too much (a sketch of a
cache-bypassing copy follows):
- stupid page reclaim
- large cache pollution by memcpy
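For illustration only: the attached Makefile has USE_NOCACHE_MEMCPY
build targets, and a cache-bypassing copy would look roughly like the
sketch below. This assumes x86 SSE2, a 16-byte-aligned dst, and n a
multiple of 16 (a real version needs head/tail handling); it is not the
code benchmarked above.

	#include <stddef.h>
	#include <emmintrin.h>	/* SSE2 intrinsics */

	/* copy with non-temporal stores so the destination data
	 * bypasses the CPU cache instead of polluting it */
	static void memcpy_nocache(void *dst, const void *src, size_t n)
	{
		__m128i *d = dst;
		const __m128i *s = src;
		size_t i;

		/* movntdq: store without allocating cache lines */
		for (i = 0; i < n / 16; i++)
			_mm_stream_si128(&d[i], _mm_loadu_si128(&s[i]));
		_mm_sfence();	/* make streamed stores globally visible */
	}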
Just my 2 cents :-p
- kosaki
* Re: [RFC] mmaped copy too slow?
From: KOSAKI Motohiro @ 2008-01-17 3:23 UTC
To: Paulo Marques
Cc: kosaki.motohiro, linux-mm, linux-kernel, Rik van Riel, Andrew Morton
Hi
> > One thing you could also try is to pass MAP_POPULATE to mmap so that the
> > page tables are filled in at the time of the mmap, avoiding a lot of
> > page faults later.
> >
>
> OK, I will test your idea and report back tomorrow.
> But I don't think page faults are the major performance impact.
I got a more interesting result :)
MAP_POPULATE is harmful for large copies.

1GB copy                      elapsed (sec)
--------------------------------------------
mmap                              71.54
mmap + madvise                    69.63
mmap + populate                  100.87
mmap + populate + madvise        101.16
More detail:

time(1) output of the mmap copy:
0.50user 3.59system 1:11.54elapsed 5%CPU (0avgtext+0avgdata 0maxresident)k
2101192inputs+2097160outputs (32776major+491573minor)pagefaults 0swaps

time(1) output of the mmap + populate copy:
0.53user 5.13system 1:40.87elapsed 5%CPU (0avgtext+0avgdata 0maxresident)k
4200808inputs+2097160outputs (49164major+737340minor)pagefaults 0swaps

Input blocks increase by about 2x (4200808 vs 2101192).
In fact, mmap(MAP_POPULATE) reads the data from disk into memory, page
reclaim drops it again soon after, and the copy loop then has to read
it from disk a second time.
Of course, when the file is small enough, MAP_POPULATE is effective.

100MB copy                    elapsed (sec)
--------------------------------------------
mmap                               7.38
mmap + madvise                     7.29
mmap + populate                    7.13
mmap + populate + madvise          6.65
- kosaki