LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH v2 1/3] lib: find_*_bit reimplementation
@ 2015-01-31 20:58 yury.norov
2015-01-31 20:58 ` [PATCH v2 2/3] lib: move find_last_bit to lib/find_next_bit.c yury.norov
` (3 more replies)
0 siblings, 4 replies; 14+ messages in thread
From: yury.norov @ 2015-01-31 20:58 UTC (permalink / raw)
To: klimov.linux, davem, akpm, hannes, dborkman, laijs,
takahiro.akashi, valentinrothberg, linux, msalter, chris, tgraf
Cc: linux-kernel, Yury Norov, Yury Norov
From: Yury Norov <y.norov@samsung.com>
New implementations takes less space in source file (see diffstat)
and in object. For me it's 710 vs 453 bytes of text.
Patch was boot-tested on x86_64 and MIPS (big-endian) machines.
Performance tests were ran on userspace with code like this:
/* addr[] is filled from /dev/urandom */
start = clock();
while (ret < nbits)
ret = find_next_bit(addr, nbits, ret + 1);
end = clock();
printf("%ld\t", (unsigned long) end - start);
On Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz rezults are next:
(for find_next_bit, nbits is 8M, for find_first_bit - 80K)
find_next_bit: find_first_bit:
new current new current
26932 43151 14777 14925
26947 43182 14521 15423
26507 43824 15053 14705
27329 43759 14473 14777
26895 43367 14847 15023
26990 43693 15103 15163
26775 43299 15067 15232
27282 42752 14544 15121
27504 43088 14644 14858
26761 43856 14699 15193
26692 43075 14781 14681
27137 42969 14451 15061
... ...
find_next_bit performance gain is 35-40%;
find_first_bit - no measurable difference.
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
lib/find_last_bit.c | 31 ++-----
lib/find_next_bit.c | 254 +++++++++++++++-------------------------------------
2 files changed, 79 insertions(+), 206 deletions(-)
diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
index 91ca09f..e67e970 100644
--- a/lib/find_last_bit.c
+++ b/lib/find_last_bit.c
@@ -4,44 +4,29 @@
* Written by Rusty Russell <rusty@rustcorp.com.au>
* (Inspired by David Howell's find_next_bit implementation)
*
+ * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
+ * size and improve performance, 2015.
+ *
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/bitops.h>
#include <linux/export.h>
-#include <asm/types.h>
-#include <asm/byteorder.h>
+#include <linux/kernel.h>
#ifndef find_last_bit
unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
{
- unsigned long words;
- unsigned long tmp;
-
- /* Start at final word. */
- words = size / BITS_PER_LONG;
-
- /* Partial final word? */
- if (size & (BITS_PER_LONG-1)) {
- tmp = (addr[words] & (~0UL >> (BITS_PER_LONG
- - (size & (BITS_PER_LONG-1)))));
- if (tmp)
- goto found;
- }
+ unsigned long idx = DIV_ROUND_UP(size, BITS_PER_LONG);
- while (words) {
- tmp = addr[--words];
- if (tmp) {
-found:
- return words * BITS_PER_LONG + __fls(tmp);
- }
+ while (idx--) {
+ if (addr[idx])
+ return min(idx * BITS_PER_LONG + __fls(addr[idx]), size);
}
- /* Not found */
return size;
}
EXPORT_SYMBOL(find_last_bit);
diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
index 0cbfc0b..ebfb3dc 100644
--- a/lib/find_next_bit.c
+++ b/lib/find_next_bit.c
@@ -3,18 +3,45 @@
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
+ * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
+ * size and improve performance, 2015.
+ *
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/bitops.h>
#include <linux/export.h>
-#include <asm/types.h>
-#include <asm/byteorder.h>
+#include <linux/kernel.h>
+
+#define HIGH_BITS_MASK(nr) (ULONG_MAX << (nr))
+
+#if !defined(find_next_bit) || !defined(find_next_zero_bit)
+static unsigned long _find_next_bit(const unsigned long *addr,
+ unsigned long nbits, unsigned long start, bool set)
+{
+ unsigned long tmp = set ? addr[start / BITS_PER_LONG]
+ : ~addr[start / BITS_PER_LONG];
+
+ /* Handle 1st word. */
+ if (!IS_ALIGNED(start, BITS_PER_LONG)) {
+ tmp &= HIGH_BITS_MASK(start % BITS_PER_LONG);
+ start = round_down(start, BITS_PER_LONG);
+ }
-#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+ while (!tmp) {
+ start += BITS_PER_LONG;
+ if (start >= nbits)
+ return nbits;
+
+ tmp = set ? addr[start / BITS_PER_LONG]
+ : ~addr[start / BITS_PER_LONG];
+ }
+
+ return start + __ffs(tmp);
+}
+#endif
#ifndef find_next_bit
/*
@@ -23,86 +50,22 @@
unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
unsigned long offset)
{
- const unsigned long *p = addr + BITOP_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG-1);
- unsigned long tmp;
-
if (offset >= size)
return size;
- size -= result;
- offset %= BITS_PER_LONG;
- if (offset) {
- tmp = *(p++);
- tmp &= (~0UL << offset);
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp)
- goto found_middle;
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- }
- while (size & ~(BITS_PER_LONG-1)) {
- if ((tmp = *(p++)))
- goto found_middle;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
- tmp = *p;
-found_first:
- tmp &= (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
- return result + size; /* Nope. */
-found_middle:
- return result + __ffs(tmp);
+ return min(_find_next_bit(addr, size, offset, 1), size);
}
EXPORT_SYMBOL(find_next_bit);
#endif
#ifndef find_next_zero_bit
-/*
- * This implementation of find_{first,next}_zero_bit was stolen from
- * Linus' asm-alpha/bitops.h.
- */
unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
unsigned long offset)
{
- const unsigned long *p = addr + BITOP_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG-1);
- unsigned long tmp;
-
if (offset >= size)
return size;
- size -= result;
- offset %= BITS_PER_LONG;
- if (offset) {
- tmp = *(p++);
- tmp |= ~0UL >> (BITS_PER_LONG - offset);
- if (size < BITS_PER_LONG)
- goto found_first;
- if (~tmp)
- goto found_middle;
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- }
- while (size & ~(BITS_PER_LONG-1)) {
- if (~(tmp = *(p++)))
- goto found_middle;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
- tmp = *p;
-found_first:
- tmp |= ~0UL << size;
- if (tmp == ~0UL) /* Are any bits zero? */
- return result + size; /* Nope. */
-found_middle:
- return result + ffz(tmp);
+ return min(_find_next_bit(addr, size, offset, 0), size);
}
EXPORT_SYMBOL(find_next_zero_bit);
#endif
@@ -113,24 +76,14 @@ EXPORT_SYMBOL(find_next_zero_bit);
*/
unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
{
- const unsigned long *p = addr;
- unsigned long result = 0;
- unsigned long tmp;
+ unsigned long idx;
- while (size & ~(BITS_PER_LONG-1)) {
- if ((tmp = *(p++)))
- goto found;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
+ for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
+ if (addr[idx])
+ return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
}
- if (!size)
- return result;
- tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
- return result + size; /* Nope. */
-found:
- return result + __ffs(tmp);
+ return size;
}
EXPORT_SYMBOL(find_first_bit);
#endif
@@ -141,24 +94,14 @@ EXPORT_SYMBOL(find_first_bit);
*/
unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
{
- const unsigned long *p = addr;
- unsigned long result = 0;
- unsigned long tmp;
+ unsigned long idx;
- while (size & ~(BITS_PER_LONG-1)) {
- if (~(tmp = *(p++)))
- goto found;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
+ for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
+ if (addr[idx] != ULONG_MAX)
+ return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
}
- if (!size)
- return result;
- tmp = (*p) | (~0UL << size);
- if (tmp == ~0UL) /* Are any bits zero? */
- return result + size; /* Nope. */
-found:
- return result + ffz(tmp);
+ return size;
}
EXPORT_SYMBOL(find_first_zero_bit);
#endif
@@ -166,18 +109,6 @@ EXPORT_SYMBOL(find_first_zero_bit);
#ifdef __BIG_ENDIAN
/* include/linux/byteorder does not support "unsigned long" type */
-static inline unsigned long ext2_swabp(const unsigned long * x)
-{
-#if BITS_PER_LONG == 64
- return (unsigned long) __swab64p((u64 *) x);
-#elif BITS_PER_LONG == 32
- return (unsigned long) __swab32p((u32 *) x);
-#else
-#error BITS_PER_LONG not defined
-#endif
-}
-
-/* include/linux/byteorder doesn't support "unsigned long" type */
static inline unsigned long ext2_swab(const unsigned long y)
{
#if BITS_PER_LONG == 64
@@ -189,48 +120,40 @@ static inline unsigned long ext2_swab(const unsigned long y)
#endif
}
+#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le)
+static unsigned long _find_next_bit_le(const unsigned long *addr,
+ unsigned long nbits, unsigned long start, bool set)
+{
+ unsigned long tmp = set ? addr[start / BITS_PER_LONG]
+ : ~addr[start / BITS_PER_LONG];
+
+ /* Handle 1st word. */
+ if (!IS_ALIGNED(start, BITS_PER_LONG)) {
+ tmp &= ext2_swab(HIGH_BITS_MASK(start % BITS_PER_LONG));
+ start = round_down(start, BITS_PER_LONG);
+ }
+
+ while (!tmp) {
+ start += BITS_PER_LONG;
+ if (start >= nbits)
+ return nbits;
+
+ tmp = set ? addr[start / BITS_PER_LONG]
+ : ~addr[start / BITS_PER_LONG];
+ }
+
+ return start + __ffs(ext2_swab(tmp));
+}
+#endif
+
#ifndef find_next_zero_bit_le
unsigned long find_next_zero_bit_le(const void *addr, unsigned
long size, unsigned long offset)
{
- const unsigned long *p = addr;
- unsigned long result = offset & ~(BITS_PER_LONG - 1);
- unsigned long tmp;
-
if (offset >= size)
return size;
- p += BITOP_WORD(offset);
- size -= result;
- offset &= (BITS_PER_LONG - 1UL);
- if (offset) {
- tmp = ext2_swabp(p++);
- tmp |= (~0UL >> (BITS_PER_LONG - offset));
- if (size < BITS_PER_LONG)
- goto found_first;
- if (~tmp)
- goto found_middle;
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- }
- while (size & ~(BITS_PER_LONG - 1)) {
- if (~(tmp = *(p++)))
- goto found_middle_swap;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
- tmp = ext2_swabp(p);
-found_first:
- tmp |= ~0UL << size;
- if (tmp == ~0UL) /* Are any bits zero? */
- return result + size; /* Nope. Skip ffz */
-found_middle:
- return result + ffz(tmp);
-
-found_middle_swap:
- return result + ffz(ext2_swab(tmp));
+ return min(_find_next_bit_le(addr, size, offset, 0), size);
}
EXPORT_SYMBOL(find_next_zero_bit_le);
#endif
@@ -239,45 +162,10 @@ EXPORT_SYMBOL(find_next_zero_bit_le);
unsigned long find_next_bit_le(const void *addr, unsigned
long size, unsigned long offset)
{
- const unsigned long *p = addr;
- unsigned long result = offset & ~(BITS_PER_LONG - 1);
- unsigned long tmp;
-
if (offset >= size)
return size;
- p += BITOP_WORD(offset);
- size -= result;
- offset &= (BITS_PER_LONG - 1UL);
- if (offset) {
- tmp = ext2_swabp(p++);
- tmp &= (~0UL << offset);
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp)
- goto found_middle;
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- }
-
- while (size & ~(BITS_PER_LONG - 1)) {
- tmp = *(p++);
- if (tmp)
- goto found_middle_swap;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
- tmp = ext2_swabp(p);
-found_first:
- tmp &= (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
- return result + size; /* Nope. */
-found_middle:
- return result + __ffs(tmp);
-found_middle_swap:
- return result + __ffs(ext2_swab(tmp));
+ return min(_find_next_bit_le(addr, size, offset, 1), size);
}
EXPORT_SYMBOL(find_next_bit_le);
#endif
--
2.1.0
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v2 2/3] lib: move find_last_bit to lib/find_next_bit.c
2015-01-31 20:58 [PATCH v2 1/3] lib: find_*_bit reimplementation yury.norov
@ 2015-01-31 20:58 ` yury.norov
2015-01-31 20:58 ` [PATCH v2 3/3] lib: rename lib/find_next_bit.c to lib/find_bit.c yury.norov
` (2 subsequent siblings)
3 siblings, 0 replies; 14+ messages in thread
From: yury.norov @ 2015-01-31 20:58 UTC (permalink / raw)
To: klimov.linux, davem, akpm, hannes, dborkman, laijs,
takahiro.akashi, valentinrothberg, linux, msalter, chris, tgraf
Cc: linux-kernel, Yury Norov
From: Yury Norov <yury.norov@gmail.com>
Currently all 'find_*_bit' family is located in lib/find_next_bit.c,
except 'find_last_bit', which is in lib/find_last_bit.c. It seems,
there's no major benefit to have it separated.
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
lib/Makefile | 2 +-
lib/find_last_bit.c | 34 ----------------------------------
lib/find_next_bit.c | 19 +++++++++++++++++++
3 files changed, 20 insertions(+), 35 deletions(-)
delete mode 100644 lib/find_last_bit.c
diff --git a/lib/Makefile b/lib/Makefile
index 3c3b30b..13990aa 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,7 +25,7 @@ obj-y += lockref.o
obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
- bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
+ bsearch.o find_next_bit.o llist.o memweight.o kfifo.o \
percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
obj-y += string_helpers.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
deleted file mode 100644
index e67e970..0000000
--- a/lib/find_last_bit.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/* find_last_bit.c: fallback find next bit implementation
- *
- * Copyright (C) 2008 IBM Corporation
- * Written by Rusty Russell <rusty@rustcorp.com.au>
- * (Inspired by David Howell's find_next_bit implementation)
- *
- * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
- * size and improve performance, 2015.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/export.h>
-#include <linux/kernel.h>
-
-#ifndef find_last_bit
-
-unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
-{
- unsigned long idx = DIV_ROUND_UP(size, BITS_PER_LONG);
-
- while (idx--) {
- if (addr[idx])
- return min(idx * BITS_PER_LONG + __fls(addr[idx]), size);
- }
-
- return size;
-}
-EXPORT_SYMBOL(find_last_bit);
-
-#endif
diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
index ebfb3dc..2087839 100644
--- a/lib/find_next_bit.c
+++ b/lib/find_next_bit.c
@@ -3,6 +3,10 @@
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
+ * Copyright (C) 2008 IBM Corporation
+ * 'find_last_bit' is written by Rusty Russell <rusty@rustcorp.com.au>
+ * (Inspired by David Howell's find_next_bit implementation)
+ *
* Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
* size and improve performance, 2015.
*
@@ -106,6 +110,21 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
EXPORT_SYMBOL(find_first_zero_bit);
#endif
+#ifndef find_last_bit
+unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+{
+ unsigned long idx = DIV_ROUND_UP(size, BITS_PER_LONG);
+
+ while (idx--) {
+ if (addr[idx])
+ return min(idx * BITS_PER_LONG + __fls(addr[idx]), size);
+ }
+
+ return size;
+}
+EXPORT_SYMBOL(find_last_bit);
+#endif
+
#ifdef __BIG_ENDIAN
/* include/linux/byteorder does not support "unsigned long" type */
--
2.1.0
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v2 3/3] lib: rename lib/find_next_bit.c to lib/find_bit.c
2015-01-31 20:58 [PATCH v2 1/3] lib: find_*_bit reimplementation yury.norov
2015-01-31 20:58 ` [PATCH v2 2/3] lib: move find_last_bit to lib/find_next_bit.c yury.norov
@ 2015-01-31 20:58 ` yury.norov
2015-02-02 11:09 ` Rasmus Villemoes
2015-02-02 3:17 ` [PATCH v2 1/3] lib: find_*_bit reimplementation George Spelvin
2015-02-02 10:43 ` Rasmus Villemoes
3 siblings, 1 reply; 14+ messages in thread
From: yury.norov @ 2015-01-31 20:58 UTC (permalink / raw)
To: klimov.linux, davem, akpm, hannes, dborkman, laijs,
takahiro.akashi, valentinrothberg, linux, msalter, chris, tgraf
Cc: linux-kernel, Yury Norov
From: Yury Norov <yury.norov@gmail.com>
This file contains implementation for:
- find_last_bit;
- find_first_zero_bit;
- find_first_bit;
- find_next_zero_bit;
- find_next_bit.
So giving more generic name looks reasonable.
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
lib/Makefile | 2 +-
lib/find_bit.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/find_next_bit.c | 192 ---------------------------------------------------
3 files changed, 195 insertions(+), 193 deletions(-)
create mode 100644 lib/find_bit.c
delete mode 100644 lib/find_next_bit.c
diff --git a/lib/Makefile b/lib/Makefile
index 13990aa..1cc93f4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,7 +25,7 @@ obj-y += lockref.o
obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
- bsearch.o find_next_bit.o llist.o memweight.o kfifo.o \
+ bsearch.o find_bit.o llist.o memweight.o kfifo.o \
percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
obj-y += string_helpers.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
diff --git a/lib/find_bit.c b/lib/find_bit.c
new file mode 100644
index 0000000..236a850
--- /dev/null
+++ b/lib/find_bit.c
@@ -0,0 +1,194 @@
+/* find_bit.c: generic implementation fore:
+ * find_last_bit; find_first_zero_bit; find_first_bit;
+ * find_next_zero_bit; find_next_bit.
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * Copyright (C) 2008 IBM Corporation
+ * 'find_last_bit' is written by Rusty Russell <rusty@rustcorp.com.au>
+ * (Inspired by David Howell's find_next_bit implementation)
+ *
+ * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
+ * size and improve performance, 2015.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+
+#define HIGH_BITS_MASK(nr) (ULONG_MAX << (nr))
+
+#if !defined(find_next_bit) || !defined(find_next_zero_bit)
+static unsigned long _find_next_bit(const unsigned long *addr,
+ unsigned long nbits, unsigned long start, bool set)
+{
+ unsigned long tmp = set ? addr[start / BITS_PER_LONG]
+ : ~addr[start / BITS_PER_LONG];
+
+ /* Handle 1st word. */
+ if (!IS_ALIGNED(start, BITS_PER_LONG)) {
+ tmp &= HIGH_BITS_MASK(start % BITS_PER_LONG);
+ start = round_down(start, BITS_PER_LONG);
+ }
+
+ while (!tmp) {
+ start += BITS_PER_LONG;
+ if (start >= nbits)
+ return nbits;
+
+ tmp = set ? addr[start / BITS_PER_LONG]
+ : ~addr[start / BITS_PER_LONG];
+ }
+
+ return start + __ffs(tmp);
+}
+#endif
+
+#ifndef find_next_bit
+/*
+ * Find the next set bit in a memory region.
+ */
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ if (offset >= size)
+ return size;
+
+ return min(_find_next_bit(addr, size, offset, 1), size);
+}
+EXPORT_SYMBOL(find_next_bit);
+#endif
+
+#ifndef find_next_zero_bit
+unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ if (offset >= size)
+ return size;
+
+ return min(_find_next_bit(addr, size, offset, 0), size);
+}
+EXPORT_SYMBOL(find_next_zero_bit);
+#endif
+
+#ifndef find_first_bit
+/*
+ * Find the first set bit in a memory region.
+ */
+unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+{
+ unsigned long idx;
+
+ for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
+ if (addr[idx])
+ return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
+ }
+
+ return size;
+}
+EXPORT_SYMBOL(find_first_bit);
+#endif
+
+#ifndef find_first_zero_bit
+/*
+ * Find the first cleared bit in a memory region.
+ */
+unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+{
+ unsigned long idx;
+
+ for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
+ if (addr[idx] != ULONG_MAX)
+ return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
+ }
+
+ return size;
+}
+EXPORT_SYMBOL(find_first_zero_bit);
+#endif
+
+#ifndef find_last_bit
+unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+{
+ unsigned long idx = DIV_ROUND_UP(size, BITS_PER_LONG);
+
+ while (idx--) {
+ if (addr[idx])
+ return min(idx * BITS_PER_LONG + __fls(addr[idx]), size);
+ }
+
+ return size;
+}
+EXPORT_SYMBOL(find_last_bit);
+#endif
+
+#ifdef __BIG_ENDIAN
+
+/* include/linux/byteorder does not support "unsigned long" type */
+static inline unsigned long ext2_swab(const unsigned long y)
+{
+#if BITS_PER_LONG == 64
+ return (unsigned long) __swab64((u64) y);
+#elif BITS_PER_LONG == 32
+ return (unsigned long) __swab32((u32) y);
+#else
+#error BITS_PER_LONG not defined
+#endif
+}
+
+#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le)
+static unsigned long _find_next_bit_le(const unsigned long *addr,
+ unsigned long nbits, unsigned long start, bool set)
+{
+ unsigned long tmp = set ? addr[start / BITS_PER_LONG]
+ : ~addr[start / BITS_PER_LONG];
+
+ /* Handle 1st word. */
+ if (!IS_ALIGNED(start, BITS_PER_LONG)) {
+ tmp &= ext2_swab(HIGH_BITS_MASK(start % BITS_PER_LONG));
+ start = round_down(start, BITS_PER_LONG);
+ }
+
+ while (!tmp) {
+ start += BITS_PER_LONG;
+ if (start >= nbits)
+ return nbits;
+
+ tmp = set ? addr[start / BITS_PER_LONG]
+ : ~addr[start / BITS_PER_LONG];
+ }
+
+ return start + __ffs(ext2_swab(tmp));
+}
+#endif
+
+#ifndef find_next_zero_bit_le
+unsigned long find_next_zero_bit_le(const void *addr, unsigned
+ long size, unsigned long offset)
+{
+ if (offset >= size)
+ return size;
+
+ return min(_find_next_bit_le(addr, size, offset, 0), size);
+}
+EXPORT_SYMBOL(find_next_zero_bit_le);
+#endif
+
+#ifndef find_next_bit_le
+unsigned long find_next_bit_le(const void *addr, unsigned
+ long size, unsigned long offset)
+{
+ if (offset >= size)
+ return size;
+
+ return min(_find_next_bit_le(addr, size, offset, 1), size);
+}
+EXPORT_SYMBOL(find_next_bit_le);
+#endif
+
+#endif /* __BIG_ENDIAN */
diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
deleted file mode 100644
index 2087839..0000000
--- a/lib/find_next_bit.c
+++ /dev/null
@@ -1,192 +0,0 @@
-/* find_next_bit.c: fallback find next bit implementation
- *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * Copyright (C) 2008 IBM Corporation
- * 'find_last_bit' is written by Rusty Russell <rusty@rustcorp.com.au>
- * (Inspired by David Howell's find_next_bit implementation)
- *
- * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
- * size and improve performance, 2015.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/export.h>
-#include <linux/kernel.h>
-
-#define HIGH_BITS_MASK(nr) (ULONG_MAX << (nr))
-
-#if !defined(find_next_bit) || !defined(find_next_zero_bit)
-static unsigned long _find_next_bit(const unsigned long *addr,
- unsigned long nbits, unsigned long start, bool set)
-{
- unsigned long tmp = set ? addr[start / BITS_PER_LONG]
- : ~addr[start / BITS_PER_LONG];
-
- /* Handle 1st word. */
- if (!IS_ALIGNED(start, BITS_PER_LONG)) {
- tmp &= HIGH_BITS_MASK(start % BITS_PER_LONG);
- start = round_down(start, BITS_PER_LONG);
- }
-
- while (!tmp) {
- start += BITS_PER_LONG;
- if (start >= nbits)
- return nbits;
-
- tmp = set ? addr[start / BITS_PER_LONG]
- : ~addr[start / BITS_PER_LONG];
- }
-
- return start + __ffs(tmp);
-}
-#endif
-
-#ifndef find_next_bit
-/*
- * Find the next set bit in a memory region.
- */
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
- unsigned long offset)
-{
- if (offset >= size)
- return size;
-
- return min(_find_next_bit(addr, size, offset, 1), size);
-}
-EXPORT_SYMBOL(find_next_bit);
-#endif
-
-#ifndef find_next_zero_bit
-unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
- unsigned long offset)
-{
- if (offset >= size)
- return size;
-
- return min(_find_next_bit(addr, size, offset, 0), size);
-}
-EXPORT_SYMBOL(find_next_zero_bit);
-#endif
-
-#ifndef find_first_bit
-/*
- * Find the first set bit in a memory region.
- */
-unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
-{
- unsigned long idx;
-
- for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
- if (addr[idx])
- return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
- }
-
- return size;
-}
-EXPORT_SYMBOL(find_first_bit);
-#endif
-
-#ifndef find_first_zero_bit
-/*
- * Find the first cleared bit in a memory region.
- */
-unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
-{
- unsigned long idx;
-
- for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
- if (addr[idx] != ULONG_MAX)
- return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
- }
-
- return size;
-}
-EXPORT_SYMBOL(find_first_zero_bit);
-#endif
-
-#ifndef find_last_bit
-unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
-{
- unsigned long idx = DIV_ROUND_UP(size, BITS_PER_LONG);
-
- while (idx--) {
- if (addr[idx])
- return min(idx * BITS_PER_LONG + __fls(addr[idx]), size);
- }
-
- return size;
-}
-EXPORT_SYMBOL(find_last_bit);
-#endif
-
-#ifdef __BIG_ENDIAN
-
-/* include/linux/byteorder does not support "unsigned long" type */
-static inline unsigned long ext2_swab(const unsigned long y)
-{
-#if BITS_PER_LONG == 64
- return (unsigned long) __swab64((u64) y);
-#elif BITS_PER_LONG == 32
- return (unsigned long) __swab32((u32) y);
-#else
-#error BITS_PER_LONG not defined
-#endif
-}
-
-#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le)
-static unsigned long _find_next_bit_le(const unsigned long *addr,
- unsigned long nbits, unsigned long start, bool set)
-{
- unsigned long tmp = set ? addr[start / BITS_PER_LONG]
- : ~addr[start / BITS_PER_LONG];
-
- /* Handle 1st word. */
- if (!IS_ALIGNED(start, BITS_PER_LONG)) {
- tmp &= ext2_swab(HIGH_BITS_MASK(start % BITS_PER_LONG));
- start = round_down(start, BITS_PER_LONG);
- }
-
- while (!tmp) {
- start += BITS_PER_LONG;
- if (start >= nbits)
- return nbits;
-
- tmp = set ? addr[start / BITS_PER_LONG]
- : ~addr[start / BITS_PER_LONG];
- }
-
- return start + __ffs(ext2_swab(tmp));
-}
-#endif
-
-#ifndef find_next_zero_bit_le
-unsigned long find_next_zero_bit_le(const void *addr, unsigned
- long size, unsigned long offset)
-{
- if (offset >= size)
- return size;
-
- return min(_find_next_bit_le(addr, size, offset, 0), size);
-}
-EXPORT_SYMBOL(find_next_zero_bit_le);
-#endif
-
-#ifndef find_next_bit_le
-unsigned long find_next_bit_le(const void *addr, unsigned
- long size, unsigned long offset)
-{
- if (offset >= size)
- return size;
-
- return min(_find_next_bit_le(addr, size, offset, 1), size);
-}
-EXPORT_SYMBOL(find_next_bit_le);
-#endif
-
-#endif /* __BIG_ENDIAN */
--
2.1.0
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 1/3] lib: find_*_bit reimplementation
2015-01-31 20:58 [PATCH v2 1/3] lib: find_*_bit reimplementation yury.norov
2015-01-31 20:58 ` [PATCH v2 2/3] lib: move find_last_bit to lib/find_next_bit.c yury.norov
2015-01-31 20:58 ` [PATCH v2 3/3] lib: rename lib/find_next_bit.c to lib/find_bit.c yury.norov
@ 2015-02-02 3:17 ` George Spelvin
2015-02-04 23:07 ` Yury
2015-02-02 10:43 ` Rasmus Villemoes
3 siblings, 1 reply; 14+ messages in thread
From: George Spelvin @ 2015-02-02 3:17 UTC (permalink / raw)
To: akpm, chris, davem, dborkman, hannes, klimov.linux, laijs, linux,
msalter, takahiro.akashi, tgraf, valentinrothberg, yury.norov
Cc: linux-kernel, y.norov
Yury Norov <y.norov@samsung.com> wrote:
> New implementations takes less space in source file (see diffstat)
> and in object. For me it's 710 vs 453 bytes of text.
>
> Patch was boot-tested on x86_64 and MIPS (big-endian) machines.
> Performance tests were ran on userspace with code like this:
>
> /* addr[] is filled from /dev/urandom */
> start = clock();
> while (ret < nbits)
> ret = find_next_bit(addr, nbits, ret + 1);
>
> end = clock();
> printf("%ld\t", (unsigned long) end - start);
> On Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz rezults are next:
> (for find_next_bit, nbits is 8M, for find_first_bit - 80K)
>
> find_next_bit: find_first_bit:
> new current new current
> 26932 43151 14777 14925
> 26947 43182 14521 15423
I'll look at this more carefully, but one immediate thought is that this
is an unrealistic benchmark. It will amost never need to look at more
than one word of the array, but real arrays have long runs of zero
bits to skip over.
So the code size is appreciated, but the time benefits may be the result
of you optimizing for the wrong thing.
I'd try filling the array with mostly-identical bits, flipping with odds
of 1/256 or so.
For full generality, I'd test different 1->0 and 0->1 transition
probabilities. (But powers of two are probably enough for benchmarking.)
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 1/3] lib: find_*_bit reimplementation
2015-01-31 20:58 [PATCH v2 1/3] lib: find_*_bit reimplementation yury.norov
` (2 preceding siblings ...)
2015-02-02 3:17 ` [PATCH v2 1/3] lib: find_*_bit reimplementation George Spelvin
@ 2015-02-02 10:43 ` Rasmus Villemoes
2015-02-02 11:47 ` George Spelvin
2015-02-04 22:52 ` Yury
3 siblings, 2 replies; 14+ messages in thread
From: Rasmus Villemoes @ 2015-02-02 10:43 UTC (permalink / raw)
To: yury.norov
Cc: klimov.linux, davem, akpm, hannes, dborkman, laijs,
takahiro.akashi, valentinrothberg, linux, msalter, chris, tgraf,
linux-kernel, Yury Norov
On Sat, Jan 31 2015, yury.norov@gmail.com wrote:
> From: Yury Norov <y.norov@samsung.com>
>
> New implementations takes less space in source file (see diffstat)
> and in object. For me it's 710 vs 453 bytes of text.
>
New version generally looks good. Please include a summary of the
changes between the versions either below the --- line or in a 0/n cover
letter, especially since you've now expanded the scope of the series.
Comments below.
>
> Patch was boot-tested on x86_64 and MIPS (big-endian) machines.
> Performance tests were ran on userspace with code like this:
>
> /* addr[] is filled from /dev/urandom */
> start = clock();
> while (ret < nbits)
> ret = find_next_bit(addr, nbits, ret + 1);
>
> end = clock();
> printf("%ld\t", (unsigned long) end - start);
>
> On Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz rezults are next:
> (for find_next_bit, nbits is 8M, for find_first_bit - 80K)
>
> find_next_bit: find_first_bit:
> new current new current
> 26932 43151 14777 14925
> 26947 43182 14521 15423
> 26507 43824 15053 14705
> 27329 43759 14473 14777
> 26895 43367 14847 15023
> 26990 43693 15103 15163
> 26775 43299 15067 15232
> 27282 42752 14544 15121
> 27504 43088 14644 14858
> 26761 43856 14699 15193
> 26692 43075 14781 14681
> 27137 42969 14451 15061
> ... ...
>
> find_next_bit performance gain is 35-40%;
> find_first_bit - no measurable difference.
>
> Signed-off-by: Yury Norov <yury.norov@gmail.com>
> ---
> lib/find_last_bit.c | 31 ++-----
> lib/find_next_bit.c | 254 +++++++++++++++-------------------------------------
> 2 files changed, 79 insertions(+), 206 deletions(-)
>
> diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
> index 91ca09f..e67e970 100644
> --- a/lib/find_last_bit.c
> +++ b/lib/find_last_bit.c
> @@ -4,44 +4,29 @@
> * Written by Rusty Russell <rusty@rustcorp.com.au>
> * (Inspired by David Howell's find_next_bit implementation)
> *
> + * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
> + * size and improve performance, 2015.
> + *
> * This program is free software; you can redistribute it and/or
> * modify it under the terms of the GNU General Public License
> * as published by the Free Software Foundation; either version
> * 2 of the License, or (at your option) any later version.
> */
>
> -#include <linux/bitops.h>
Why do you remove that #include? It is rather important that the header
and implementation don't get out of sync. I know that kernel.h includes
bitops.h, but please don't rely on such things. Quoting SubmitChecklist:
1: If you use a facility then #include the file that defines/declares
that facility. Don't depend on other header files pulling in ones
that you use.
> #include <linux/export.h>
> -#include <asm/types.h>
> -#include <asm/byteorder.h>
However, getting rid of includes that are no longer needed is certainly
a good thing.
> +#include <linux/kernel.h>
>
> #ifndef find_last_bit
>
> unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
> {
> - unsigned long words;
> - unsigned long tmp;
> -
> - /* Start at final word. */
> - words = size / BITS_PER_LONG;
> -
> - /* Partial final word? */
> - if (size & (BITS_PER_LONG-1)) {
> - tmp = (addr[words] & (~0UL >> (BITS_PER_LONG
> - - (size & (BITS_PER_LONG-1)))));
> - if (tmp)
> - goto found;
> - }
> + unsigned long idx = DIV_ROUND_UP(size, BITS_PER_LONG);
>
> - while (words) {
> - tmp = addr[--words];
> - if (tmp) {
> -found:
> - return words * BITS_PER_LONG + __fls(tmp);
> - }
> + while (idx--) {
> + if (addr[idx])
> + return min(idx * BITS_PER_LONG + __fls(addr[idx]), size);
> }
>
> - /* Not found */
> return size;
> }
> EXPORT_SYMBOL(find_last_bit);
> diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
> index 0cbfc0b..ebfb3dc 100644
> --- a/lib/find_next_bit.c
> +++ b/lib/find_next_bit.c
> @@ -3,18 +3,45 @@
> * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
> * Written by David Howells (dhowells@redhat.com)
> *
> + * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
> + * size and improve performance, 2015.
> + *
> * This program is free software; you can redistribute it and/or
> * modify it under the terms of the GNU General Public License
> * as published by the Free Software Foundation; either version
> * 2 of the License, or (at your option) any later version.
> */
>
> -#include <linux/bitops.h>
> #include <linux/export.h>
> -#include <asm/types.h>
> -#include <asm/byteorder.h>
> +#include <linux/kernel.h>
Same as above.
> +#define HIGH_BITS_MASK(nr) (ULONG_MAX << (nr))
> +
> +#if !defined(find_next_bit) || !defined(find_next_zero_bit)
> +static unsigned long _find_next_bit(const unsigned long *addr,
> + unsigned long nbits, unsigned long start, bool set)
> +{
> + unsigned long tmp = set ? addr[start / BITS_PER_LONG]
> + : ~addr[start / BITS_PER_LONG];
> +
> + /* Handle 1st word. */
> + if (!IS_ALIGNED(start, BITS_PER_LONG)) {
> + tmp &= HIGH_BITS_MASK(start % BITS_PER_LONG);
> + start = round_down(start, BITS_PER_LONG);
> + }
>
> -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
> + while (!tmp) {
> + start += BITS_PER_LONG;
> + if (start >= nbits)
> + return nbits;
> +
> + tmp = set ? addr[start / BITS_PER_LONG]
> + : ~addr[start / BITS_PER_LONG];
> + }
> +
> + return start + __ffs(tmp);
> +}
> +#endif
>
> #ifndef find_next_bit
> /*
> @@ -23,86 +50,22 @@
> unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
> unsigned long offset)
> {
> - const unsigned long *p = addr + BITOP_WORD(offset);
> - unsigned long result = offset & ~(BITS_PER_LONG-1);
> - unsigned long tmp;
> -
> if (offset >= size)
> return size;
Why can't this ...
> - size -= result;
> - offset %= BITS_PER_LONG;
> - if (offset) {
> - tmp = *(p++);
> - tmp &= (~0UL << offset);
> - if (size < BITS_PER_LONG)
> - goto found_first;
> - if (tmp)
> - goto found_middle;
> - size -= BITS_PER_LONG;
> - result += BITS_PER_LONG;
> - }
> - while (size & ~(BITS_PER_LONG-1)) {
> - if ((tmp = *(p++)))
> - goto found_middle;
> - result += BITS_PER_LONG;
> - size -= BITS_PER_LONG;
> - }
> - if (!size)
> - return result;
> - tmp = *p;
>
> -found_first:
> - tmp &= (~0UL >> (BITS_PER_LONG - size));
> - if (tmp == 0UL) /* Are any bits set? */
> - return result + size; /* Nope. */
> -found_middle:
> - return result + __ffs(tmp);
> + return min(_find_next_bit(addr, size, offset, 1), size);
... and this be part of _find_next_bit? Can find_next_bit not be simply
'return _find_next_bit(addr, size, offset, 1);', and similarly for
find_next_zero_bit? Btw., passing true and false for the boolean
parameter may be a little clearer.
> }
> EXPORT_SYMBOL(find_next_bit);
> #endif
>
> #ifndef find_next_zero_bit
> -/*
> - * This implementation of find_{first,next}_zero_bit was stolen from
> - * Linus' asm-alpha/bitops.h.
> - */
> unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
> unsigned long offset)
> {
> - const unsigned long *p = addr + BITOP_WORD(offset);
> - unsigned long result = offset & ~(BITS_PER_LONG-1);
> - unsigned long tmp;
> -
> if (offset >= size)
> return size;
See above.
> - size -= result;
> - offset %= BITS_PER_LONG;
> - if (offset) {
> - tmp = *(p++);
> - tmp |= ~0UL >> (BITS_PER_LONG - offset);
> - if (size < BITS_PER_LONG)
> - goto found_first;
> - if (~tmp)
> - goto found_middle;
> - size -= BITS_PER_LONG;
> - result += BITS_PER_LONG;
> - }
> - while (size & ~(BITS_PER_LONG-1)) {
> - if (~(tmp = *(p++)))
> - goto found_middle;
> - result += BITS_PER_LONG;
> - size -= BITS_PER_LONG;
> - }
> - if (!size)
> - return result;
> - tmp = *p;
>
> -found_first:
> - tmp |= ~0UL << size;
> - if (tmp == ~0UL) /* Are any bits zero? */
> - return result + size; /* Nope. */
> -found_middle:
> - return result + ffz(tmp);
> + return min(_find_next_bit(addr, size, offset, 0), size);
See above.
> }
> EXPORT_SYMBOL(find_next_zero_bit);
> #endif
> @@ -113,24 +76,14 @@ EXPORT_SYMBOL(find_next_zero_bit);
> */
> unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
> {
> - const unsigned long *p = addr;
> - unsigned long result = 0;
> - unsigned long tmp;
> + unsigned long idx;
>
> - while (size & ~(BITS_PER_LONG-1)) {
> - if ((tmp = *(p++)))
> - goto found;
> - result += BITS_PER_LONG;
> - size -= BITS_PER_LONG;
> + for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
> + if (addr[idx])
> + return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
> }
> - if (!size)
> - return result;
>
> - tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
> - if (tmp == 0UL) /* Are any bits set? */
> - return result + size; /* Nope. */
> -found:
> - return result + __ffs(tmp);
> + return size;
> }
> EXPORT_SYMBOL(find_first_bit);
> #endif
> @@ -141,24 +94,14 @@ EXPORT_SYMBOL(find_first_bit);
> */
> unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
> {
> - const unsigned long *p = addr;
> - unsigned long result = 0;
> - unsigned long tmp;
> + unsigned long idx;
>
> - while (size & ~(BITS_PER_LONG-1)) {
> - if (~(tmp = *(p++)))
> - goto found;
> - result += BITS_PER_LONG;
> - size -= BITS_PER_LONG;
> + for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
> + if (addr[idx] != ULONG_MAX)
> + return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
> }
> - if (!size)
> - return result;
>
> - tmp = (*p) | (~0UL << size);
> - if (tmp == ~0UL) /* Are any bits zero? */
> - return result + size; /* Nope. */
> -found:
> - return result + ffz(tmp);
> + return size;
> }
> EXPORT_SYMBOL(find_first_zero_bit);
> #endif
> @@ -166,18 +109,6 @@ EXPORT_SYMBOL(find_first_zero_bit);
> #ifdef __BIG_ENDIAN
>
> /* include/linux/byteorder does not support "unsigned long" type */
> -static inline unsigned long ext2_swabp(const unsigned long * x)
> -{
> -#if BITS_PER_LONG == 64
> - return (unsigned long) __swab64p((u64 *) x);
> -#elif BITS_PER_LONG == 32
> - return (unsigned long) __swab32p((u32 *) x);
> -#else
> -#error BITS_PER_LONG not defined
> -#endif
> -}
> -
> -/* include/linux/byteorder doesn't support "unsigned long" type */
> static inline unsigned long ext2_swab(const unsigned long y)
> {
> #if BITS_PER_LONG == 64
> @@ -189,48 +120,40 @@ static inline unsigned long ext2_swab(const unsigned long y)
> #endif
> }
>
> +#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le)
> +static unsigned long _find_next_bit_le(const unsigned long *addr,
> + unsigned long nbits, unsigned long start, bool set)
> +{
> + unsigned long tmp = set ? addr[start / BITS_PER_LONG]
> + : ~addr[start / BITS_PER_LONG];
> +
> + /* Handle 1st word. */
> + if (!IS_ALIGNED(start, BITS_PER_LONG)) {
> + tmp &= ext2_swab(HIGH_BITS_MASK(start % BITS_PER_LONG));
> + start = round_down(start, BITS_PER_LONG);
> + }
> +
> + while (!tmp) {
> + start += BITS_PER_LONG;
> + if (start >= nbits)
> + return nbits;
> +
> + tmp = set ? addr[start / BITS_PER_LONG]
> + : ~addr[start / BITS_PER_LONG];
> + }
> +
> + return start + __ffs(ext2_swab(tmp));
> +}
> +#endif
> +
> #ifndef find_next_zero_bit_le
> unsigned long find_next_zero_bit_le(const void *addr, unsigned
> long size, unsigned long offset)
> {
> - const unsigned long *p = addr;
> - unsigned long result = offset & ~(BITS_PER_LONG - 1);
> - unsigned long tmp;
> -
> if (offset >= size)
> return size;
Again, I think this should be moved to the common implementation in
_find_next_bit_le, and similarly for find_next_bit_le below.
> - p += BITOP_WORD(offset);
> - size -= result;
> - offset &= (BITS_PER_LONG - 1UL);
> - if (offset) {
> - tmp = ext2_swabp(p++);
> - tmp |= (~0UL >> (BITS_PER_LONG - offset));
> - if (size < BITS_PER_LONG)
> - goto found_first;
> - if (~tmp)
> - goto found_middle;
> - size -= BITS_PER_LONG;
> - result += BITS_PER_LONG;
> - }
>
> - while (size & ~(BITS_PER_LONG - 1)) {
> - if (~(tmp = *(p++)))
> - goto found_middle_swap;
> - result += BITS_PER_LONG;
> - size -= BITS_PER_LONG;
> - }
> - if (!size)
> - return result;
> - tmp = ext2_swabp(p);
> -found_first:
> - tmp |= ~0UL << size;
> - if (tmp == ~0UL) /* Are any bits zero? */
> - return result + size; /* Nope. Skip ffz */
> -found_middle:
> - return result + ffz(tmp);
> -
> -found_middle_swap:
> - return result + ffz(ext2_swab(tmp));
> + return min(_find_next_bit_le(addr, size, offset, 0), size);
> }
> EXPORT_SYMBOL(find_next_zero_bit_le);
> #endif
> @@ -239,45 +162,10 @@ EXPORT_SYMBOL(find_next_zero_bit_le);
> unsigned long find_next_bit_le(const void *addr, unsigned
> long size, unsigned long offset)
> {
> - const unsigned long *p = addr;
> - unsigned long result = offset & ~(BITS_PER_LONG - 1);
> - unsigned long tmp;
> -
> if (offset >= size)
> return size;
> - p += BITOP_WORD(offset);
> - size -= result;
> - offset &= (BITS_PER_LONG - 1UL);
> - if (offset) {
> - tmp = ext2_swabp(p++);
> - tmp &= (~0UL << offset);
> - if (size < BITS_PER_LONG)
> - goto found_first;
> - if (tmp)
> - goto found_middle;
> - size -= BITS_PER_LONG;
> - result += BITS_PER_LONG;
> - }
> -
> - while (size & ~(BITS_PER_LONG - 1)) {
> - tmp = *(p++);
> - if (tmp)
> - goto found_middle_swap;
> - result += BITS_PER_LONG;
> - size -= BITS_PER_LONG;
> - }
> - if (!size)
> - return result;
> - tmp = ext2_swabp(p);
> -found_first:
> - tmp &= (~0UL >> (BITS_PER_LONG - size));
> - if (tmp == 0UL) /* Are any bits set? */
> - return result + size; /* Nope. */
> -found_middle:
> - return result + __ffs(tmp);
>
> -found_middle_swap:
> - return result + __ffs(ext2_swab(tmp));
> + return min(_find_next_bit_le(addr, size, offset, 1), size);
> }
> EXPORT_SYMBOL(find_next_bit_le);
> #endif
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 3/3] lib: rename lib/find_next_bit.c to lib/find_bit.c
2015-01-31 20:58 ` [PATCH v2 3/3] lib: rename lib/find_next_bit.c to lib/find_bit.c yury.norov
@ 2015-02-02 11:09 ` Rasmus Villemoes
0 siblings, 0 replies; 14+ messages in thread
From: Rasmus Villemoes @ 2015-02-02 11:09 UTC (permalink / raw)
To: yury.norov
Cc: klimov.linux, davem, akpm, hannes, dborkman, laijs,
takahiro.akashi, valentinrothberg, linux, msalter, chris, tgraf,
linux-kernel
On Sat, Jan 31 2015, yury.norov@gmail.com wrote:
> From: Yury Norov <yury.norov@gmail.com>
>
> This file contains implementation for:
> - find_last_bit;
> - find_first_zero_bit;
> - find_first_bit;
> - find_next_zero_bit;
> - find_next_bit.
>
[and a few _le variants]
> So giving more generic name looks reasonable.
It does. But it is a little annoying that it is not shown as a pure
rename.
> Signed-off-by: Yury Norov <yury.norov@gmail.com>
> ---
> lib/Makefile | 2 +-
> lib/find_bit.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> lib/find_next_bit.c | 192 ---------------------------------------------------
> 3 files changed, 195 insertions(+), 193 deletions(-)
> create mode 100644 lib/find_bit.c
> delete mode 100644 lib/find_next_bit.c
>
> diff --git a/lib/Makefile b/lib/Makefile
> index 13990aa..1cc93f4 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -25,7 +25,7 @@ obj-y += lockref.o
> obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
> bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
> gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
> - bsearch.o find_next_bit.o llist.o memweight.o kfifo.o \
> + bsearch.o find_bit.o llist.o memweight.o kfifo.o \
> percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
> obj-y += string_helpers.o
> obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
> diff --git a/lib/find_bit.c b/lib/find_bit.c
> new file mode 100644
> index 0000000..236a850
> --- /dev/null
> +++ b/lib/find_bit.c
> @@ -0,0 +1,194 @@
> +/* find_bit.c: generic implementation fore:
> + * find_last_bit; find_first_zero_bit; find_first_bit;
> + * find_next_zero_bit; find_next_bit.
> + *
[trivial typo: s/fore/for/]
I _think_ the cause of the 2-line descrepancy is this change in a comment,
but it's hard to tell for sure. There's no simple way of telling whether the
other 192 lines are actually the same or if subtle changes have been
introduced.
This is one reason hard-coding the name of a file inside the file is a
bad idea. I'd suggest tweaking that comment in one of the earlier
patches, for example the one moving find_last_bit to find_next_bit.c,
making sure to remove the filename. Then this patch can be a simple 'git
mv' and the same two-line change of the Makefile.
Rasmus
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 1/3] lib: find_*_bit reimplementation
2015-02-02 10:43 ` Rasmus Villemoes
@ 2015-02-02 11:47 ` George Spelvin
2015-02-02 12:56 ` Rasmus Villemoes
2015-02-04 22:52 ` Yury
1 sibling, 1 reply; 14+ messages in thread
From: George Spelvin @ 2015-02-02 11:47 UTC (permalink / raw)
To: linux, yury.norov
Cc: akpm, chris, davem, dborkman, hannes, klimov.linux, laijs,
linux-kernel, linux, msalter, takahiro.akashi, tgraf,
valentinrothberg, y.norov
Rasmus Villemoes <linux@rasmusvillemoes.dk> wrote:
> ... and this be part of _find_next_bit? Can find_next_bit not be simply
> 'return _find_next_bit(addr, size, offset, 1);', and similarly for
> find_next_zero_bit? Btw., passing true and false for the boolean
> parameter may be a little clearer.
Looking at the generated code, it would be better to replace the boolean
parameter with 0ul or ~0ul and XOR with it. The same number of registers,
and saves a conditional branch.
(I was hoping GCC would figure that trick out, but it didn't.)
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 1/3] lib: find_*_bit reimplementation
2015-02-02 11:47 ` George Spelvin
@ 2015-02-02 12:56 ` Rasmus Villemoes
2015-02-04 23:45 ` Yury
0 siblings, 1 reply; 14+ messages in thread
From: Rasmus Villemoes @ 2015-02-02 12:56 UTC (permalink / raw)
To: George Spelvin
Cc: yury.norov, akpm, chris, davem, dborkman, hannes, klimov.linux,
laijs, linux-kernel, msalter, takahiro.akashi, tgraf,
valentinrothberg, y.norov
On Mon, Feb 02 2015, "George Spelvin" <linux@horizon.com> wrote:
> Rasmus Villemoes <linux@rasmusvillemoes.dk> wrote:
>> ... and this be part of _find_next_bit? Can find_next_bit not be simply
>> 'return _find_next_bit(addr, size, offset, 1);', and similarly for
>> find_next_zero_bit? Btw., passing true and false for the boolean
>> parameter may be a little clearer.
>
> Looking at the generated code, it would be better to replace the boolean
> parameter with 0ul or ~0ul and XOR with it. The same number of registers,
> and saves a conditional branch.
Nice trick. When I compiled it, gcc inlined _find_next_bit into both its
callers, making the conditional go away completely. That was with gcc
4.7. When I try with 5.0, I do see _find_next_bit being compiled
separately.
With the proposed change, 4.7 also makes find_next{,_zero}_bit wrappers
for _find_next_bit, further reducing the total size, which is a good
thing. And, if some other version decides to still inline it, it
should then know how to optimize the xor with 0ul or ~0ul just as well
as when the conditional was folded away.
Yury, please also incorporate this in the next round.
Rasmus
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 1/3] lib: find_*_bit reimplementation
2015-02-02 10:43 ` Rasmus Villemoes
2015-02-02 11:47 ` George Spelvin
@ 2015-02-04 22:52 ` Yury
2015-02-05 15:01 ` Rasmus Villemoes
1 sibling, 1 reply; 14+ messages in thread
From: Yury @ 2015-02-04 22:52 UTC (permalink / raw)
To: Rasmus Villemoes
Cc: klimov.linux, davem, akpm, hannes, dborkman, laijs,
takahiro.akashi, valentinrothberg, linux, msalter, chris, tgraf,
linux-kernel, Yury Norov
On 02.02.2015 13:43, Rasmus Villemoes wrote:
> On Sat, Jan 31 2015, yury.norov@gmail.com wrote:
>
>> From: Yury Norov <y.norov@samsung.com>
>>
>> New implementations takes less space in source file (see diffstat)
>> and in object. For me it's 710 vs 453 bytes of text.
>>
> New version generally looks good. Please include a summary of the
> changes between the versions either below the --- line or in a 0/n cover
> letter, especially since you've now expanded the scope of the series.
>
> Comments below.
>
>> Patch was boot-tested on x86_64 and MIPS (big-endian) machines.
>> Performance tests were ran on userspace with code like this:
>>
>> /* addr[] is filled from /dev/urandom */
>> start = clock();
>> while (ret < nbits)
>> ret = find_next_bit(addr, nbits, ret + 1);
>>
>> end = clock();
>> printf("%ld\t", (unsigned long) end - start);
>>
>> On Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz rezults are next:
>> (for find_next_bit, nbits is 8M, for find_first_bit - 80K)
>>
>> find_next_bit: find_first_bit:
>> new current new current
>> 26932 43151 14777 14925
>> 26947 43182 14521 15423
>> 26507 43824 15053 14705
>> 27329 43759 14473 14777
>> 26895 43367 14847 15023
>> 26990 43693 15103 15163
>> 26775 43299 15067 15232
>> 27282 42752 14544 15121
>> 27504 43088 14644 14858
>> 26761 43856 14699 15193
>> 26692 43075 14781 14681
>> 27137 42969 14451 15061
>> ... ...
>>
>> find_next_bit performance gain is 35-40%;
>> find_first_bit - no measurable difference.
>>
>> Signed-off-by: Yury Norov <yury.norov@gmail.com>
>> ---
>> lib/find_last_bit.c | 31 ++-----
>> lib/find_next_bit.c | 254 +++++++++++++++-------------------------------------
>> 2 files changed, 79 insertions(+), 206 deletions(-)
>>
>> diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
>> index 91ca09f..e67e970 100644
>> --- a/lib/find_last_bit.c
>> +++ b/lib/find_last_bit.c
>> @@ -4,44 +4,29 @@
>> * Written by Rusty Russell <rusty@rustcorp.com.au>
>> * (Inspired by David Howell's find_next_bit implementation)
>> *
>> + * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
>> + * size and improve performance, 2015.
>> + *
>> * This program is free software; you can redistribute it and/or
>> * modify it under the terms of the GNU General Public License
>> * as published by the Free Software Foundation; either version
>> * 2 of the License, or (at your option) any later version.
>> */
>>
>> -#include <linux/bitops.h>
> Why do you remove that #include? It is rather important that the header
> and implementation don't get out of sync. I know that kernel.h includes
> bitops.h, but please don't rely on such things. Quoting SubmitChecklist:
>
> 1: If you use a facility then #include the file that defines/declares
> that facility. Don't depend on other header files pulling in ones
> that you use.
>
>
>> #include <linux/export.h>
>> -#include <asm/types.h>
>> -#include <asm/byteorder.h>
> However, getting rid of includes that are no longer needed is certainly
> a good thing.
Yes, linux/bitops.h are to get back.
>> +#include <linux/kernel.h>
>>
>> #ifndef find_last_bit
>>
>> unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
>> {
>> - unsigned long words;
>> - unsigned long tmp;
>> -
>> - /* Start at final word. */
>> - words = size / BITS_PER_LONG;
>> -
>> - /* Partial final word? */
>> - if (size & (BITS_PER_LONG-1)) {
>> - tmp = (addr[words] & (~0UL >> (BITS_PER_LONG
>> - - (size & (BITS_PER_LONG-1)))));
>> - if (tmp)
>> - goto found;
>> - }
>> + unsigned long idx = DIV_ROUND_UP(size, BITS_PER_LONG);
>>
>> - while (words) {
>> - tmp = addr[--words];
>> - if (tmp) {
>> -found:
>> - return words * BITS_PER_LONG + __fls(tmp);
>> - }
>> + while (idx--) {
>> + if (addr[idx])
>> + return min(idx * BITS_PER_LONG + __fls(addr[idx]), size);
>> }
>>
>> - /* Not found */
>> return size;
>> }
>> EXPORT_SYMBOL(find_last_bit);
>> diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
>> index 0cbfc0b..ebfb3dc 100644
>> --- a/lib/find_next_bit.c
>> +++ b/lib/find_next_bit.c
>> @@ -3,18 +3,45 @@
>> * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
>> * Written by David Howells (dhowells@redhat.com)
>> *
>> + * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
>> + * size and improve performance, 2015.
>> + *
>> * This program is free software; you can redistribute it and/or
>> * modify it under the terms of the GNU General Public License
>> * as published by the Free Software Foundation; either version
>> * 2 of the License, or (at your option) any later version.
>> */
>>
>> -#include <linux/bitops.h>
>> #include <linux/export.h>
>> -#include <asm/types.h>
>> -#include <asm/byteorder.h>
>> +#include <linux/kernel.h>
> Same as above.
>
>> +#define HIGH_BITS_MASK(nr) (ULONG_MAX << (nr))
>> +
>> +#if !defined(find_next_bit) || !defined(find_next_zero_bit)
>> +static unsigned long _find_next_bit(const unsigned long *addr,
>> + unsigned long nbits, unsigned long start, bool set)
>> +{
>> + unsigned long tmp = set ? addr[start / BITS_PER_LONG]
>> + : ~addr[start / BITS_PER_LONG];
>> +
>> + /* Handle 1st word. */
>> + if (!IS_ALIGNED(start, BITS_PER_LONG)) {
>> + tmp &= HIGH_BITS_MASK(start % BITS_PER_LONG);
>> + start = round_down(start, BITS_PER_LONG);
>> + }
>>
>> -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
>> + while (!tmp) {
>> + start += BITS_PER_LONG;
>> + if (start >= nbits)
>> + return nbits;
>> +
>> + tmp = set ? addr[start / BITS_PER_LONG]
>> + : ~addr[start / BITS_PER_LONG];
>> + }
>> +
>> + return start + __ffs(tmp);
>> +}
>> +#endif
>>
>> #ifndef find_next_bit
>> /*
>> @@ -23,86 +50,22 @@
>> unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
>> unsigned long offset)
>> {
>> - const unsigned long *p = addr + BITOP_WORD(offset);
>> - unsigned long result = offset & ~(BITS_PER_LONG-1);
>> - unsigned long tmp;
>> -
>> if (offset >= size)
>> return size;
> Why can't this ...
>
>
>> - size -= result;
>> - offset %= BITS_PER_LONG;
>> - if (offset) {
>> - tmp = *(p++);
>> - tmp &= (~0UL << offset);
>> - if (size < BITS_PER_LONG)
>> - goto found_first;
>> - if (tmp)
>> - goto found_middle;
>> - size -= BITS_PER_LONG;
>> - result += BITS_PER_LONG;
>> - }
>> - while (size & ~(BITS_PER_LONG-1)) {
>> - if ((tmp = *(p++)))
>> - goto found_middle;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> - }
>> - if (!size)
>> - return result;
>> - tmp = *p;
>>
>> -found_first:
>> - tmp &= (~0UL >> (BITS_PER_LONG - size));
>> - if (tmp == 0UL) /* Are any bits set? */
>> - return result + size; /* Nope. */
>> -found_middle:
>> - return result + __ffs(tmp);
>> + return min(_find_next_bit(addr, size, offset, 1), size);
> ... and this be part of _find_next_bit? Can find_next_bit not be simply
> 'return _find_next_bit(addr, size, offset, 1);', and similarly for
> find_next_zero_bit? Btw., passing true and false for the boolean
> parameter may be a little clearer.
I moved size checkers out of '_find_next_bit' to let user call it from his code
if he knows for sure that size/offset pair is valid. This may help save a couple
of clocks. I think, I'll walk over the code to find how many such places we have.
If not too much / not in critical paths, checks may be moved into the function.
Or maybe leave as is and place some comment for future?...
>
>> }
>> EXPORT_SYMBOL(find_next_bit);
>> #endif
>>
>> #ifndef find_next_zero_bit
>> -/*
>> - * This implementation of find_{first,next}_zero_bit was stolen from
>> - * Linus' asm-alpha/bitops.h.
>> - */
>> unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
>> unsigned long offset)
>> {
>> - const unsigned long *p = addr + BITOP_WORD(offset);
>> - unsigned long result = offset & ~(BITS_PER_LONG-1);
>> - unsigned long tmp;
>> -
>> if (offset >= size)
>> return size;
> See above.
>
>> - size -= result;
>> - offset %= BITS_PER_LONG;
>> - if (offset) {
>> - tmp = *(p++);
>> - tmp |= ~0UL >> (BITS_PER_LONG - offset);
>> - if (size < BITS_PER_LONG)
>> - goto found_first;
>> - if (~tmp)
>> - goto found_middle;
>> - size -= BITS_PER_LONG;
>> - result += BITS_PER_LONG;
>> - }
>> - while (size & ~(BITS_PER_LONG-1)) {
>> - if (~(tmp = *(p++)))
>> - goto found_middle;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> - }
>> - if (!size)
>> - return result;
>> - tmp = *p;
>>
>> -found_first:
>> - tmp |= ~0UL << size;
>> - if (tmp == ~0UL) /* Are any bits zero? */
>> - return result + size; /* Nope. */
>> -found_middle:
>> - return result + ffz(tmp);
>> + return min(_find_next_bit(addr, size, offset, 0), size);
> See above.
>
>> }
>> EXPORT_SYMBOL(find_next_zero_bit);
>> #endif
>> @@ -113,24 +76,14 @@ EXPORT_SYMBOL(find_next_zero_bit);
>> */
>> unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
>> {
>> - const unsigned long *p = addr;
>> - unsigned long result = 0;
>> - unsigned long tmp;
>> + unsigned long idx;
>>
>> - while (size & ~(BITS_PER_LONG-1)) {
>> - if ((tmp = *(p++)))
>> - goto found;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> + for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
>> + if (addr[idx])
>> + return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
>> }
>> - if (!size)
>> - return result;
>>
>> - tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
>> - if (tmp == 0UL) /* Are any bits set? */
>> - return result + size; /* Nope. */
>> -found:
>> - return result + __ffs(tmp);
>> + return size;
>> }
>> EXPORT_SYMBOL(find_first_bit);
>> #endif
>> @@ -141,24 +94,14 @@ EXPORT_SYMBOL(find_first_bit);
>> */
>> unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
>> {
>> - const unsigned long *p = addr;
>> - unsigned long result = 0;
>> - unsigned long tmp;
>> + unsigned long idx;
>>
>> - while (size & ~(BITS_PER_LONG-1)) {
>> - if (~(tmp = *(p++)))
>> - goto found;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> + for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
>> + if (addr[idx] != ULONG_MAX)
>> + return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
>> }
>> - if (!size)
>> - return result;
>>
>> - tmp = (*p) | (~0UL << size);
>> - if (tmp == ~0UL) /* Are any bits zero? */
>> - return result + size; /* Nope. */
>> -found:
>> - return result + ffz(tmp);
>> + return size;
>> }
>> EXPORT_SYMBOL(find_first_zero_bit);
>> #endif
>> @@ -166,18 +109,6 @@ EXPORT_SYMBOL(find_first_zero_bit);
>> #ifdef __BIG_ENDIAN
>>
>> /* include/linux/byteorder does not support "unsigned long" type */
>> -static inline unsigned long ext2_swabp(const unsigned long * x)
>> -{
>> -#if BITS_PER_LONG == 64
>> - return (unsigned long) __swab64p((u64 *) x);
>> -#elif BITS_PER_LONG == 32
>> - return (unsigned long) __swab32p((u32 *) x);
>> -#else
>> -#error BITS_PER_LONG not defined
>> -#endif
>> -}
>> -
>> -/* include/linux/byteorder doesn't support "unsigned long" type */
>> static inline unsigned long ext2_swab(const unsigned long y)
>> {
>> #if BITS_PER_LONG == 64
>> @@ -189,48 +120,40 @@ static inline unsigned long ext2_swab(const unsigned long y)
>> #endif
>> }
>>
>> +#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le)
>> +static unsigned long _find_next_bit_le(const unsigned long *addr,
>> + unsigned long nbits, unsigned long start, bool set)
>> +{
>> + unsigned long tmp = set ? addr[start / BITS_PER_LONG]
>> + : ~addr[start / BITS_PER_LONG];
>> +
>> + /* Handle 1st word. */
>> + if (!IS_ALIGNED(start, BITS_PER_LONG)) {
>> + tmp &= ext2_swab(HIGH_BITS_MASK(start % BITS_PER_LONG));
>> + start = round_down(start, BITS_PER_LONG);
>> + }
>> +
>> + while (!tmp) {
>> + start += BITS_PER_LONG;
>> + if (start >= nbits)
>> + return nbits;
>> +
>> + tmp = set ? addr[start / BITS_PER_LONG]
>> + : ~addr[start / BITS_PER_LONG];
>> + }
>> +
>> + return start + __ffs(ext2_swab(tmp));
>> +}
>> +#endif
>> +
>> #ifndef find_next_zero_bit_le
>> unsigned long find_next_zero_bit_le(const void *addr, unsigned
>> long size, unsigned long offset)
>> {
>> - const unsigned long *p = addr;
>> - unsigned long result = offset & ~(BITS_PER_LONG - 1);
>> - unsigned long tmp;
>> -
>> if (offset >= size)
>> return size;
> Again, I think this should be moved to the common implementation in
> _find_next_bit_le, and similarly for find_next_bit_le below.
>
>> - p += BITOP_WORD(offset);
>> - size -= result;
>> - offset &= (BITS_PER_LONG - 1UL);
>> - if (offset) {
>> - tmp = ext2_swabp(p++);
>> - tmp |= (~0UL >> (BITS_PER_LONG - offset));
>> - if (size < BITS_PER_LONG)
>> - goto found_first;
>> - if (~tmp)
>> - goto found_middle;
>> - size -= BITS_PER_LONG;
>> - result += BITS_PER_LONG;
>> - }
>>
>> - while (size & ~(BITS_PER_LONG - 1)) {
>> - if (~(tmp = *(p++)))
>> - goto found_middle_swap;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> - }
>> - if (!size)
>> - return result;
>> - tmp = ext2_swabp(p);
>> -found_first:
>> - tmp |= ~0UL << size;
>> - if (tmp == ~0UL) /* Are any bits zero? */
>> - return result + size; /* Nope. Skip ffz */
>> -found_middle:
>> - return result + ffz(tmp);
>> -
>> -found_middle_swap:
>> - return result + ffz(ext2_swab(tmp));
>> + return min(_find_next_bit_le(addr, size, offset, 0), size);
>> }
>> EXPORT_SYMBOL(find_next_zero_bit_le);
>> #endif
>> @@ -239,45 +162,10 @@ EXPORT_SYMBOL(find_next_zero_bit_le);
>> unsigned long find_next_bit_le(const void *addr, unsigned
>> long size, unsigned long offset)
>> {
>> - const unsigned long *p = addr;
>> - unsigned long result = offset & ~(BITS_PER_LONG - 1);
>> - unsigned long tmp;
>> -
>> if (offset >= size)
>> return size;
>> - p += BITOP_WORD(offset);
>> - size -= result;
>> - offset &= (BITS_PER_LONG - 1UL);
>> - if (offset) {
>> - tmp = ext2_swabp(p++);
>> - tmp &= (~0UL << offset);
>> - if (size < BITS_PER_LONG)
>> - goto found_first;
>> - if (tmp)
>> - goto found_middle;
>> - size -= BITS_PER_LONG;
>> - result += BITS_PER_LONG;
>> - }
>> -
>> - while (size & ~(BITS_PER_LONG - 1)) {
>> - tmp = *(p++);
>> - if (tmp)
>> - goto found_middle_swap;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> - }
>> - if (!size)
>> - return result;
>> - tmp = ext2_swabp(p);
>> -found_first:
>> - tmp &= (~0UL >> (BITS_PER_LONG - size));
>> - if (tmp == 0UL) /* Are any bits set? */
>> - return result + size; /* Nope. */
>> -found_middle:
>> - return result + __ffs(tmp);
>>
>> -found_middle_swap:
>> - return result + __ffs(ext2_swab(tmp));
>> + return min(_find_next_bit_le(addr, size, offset, 1), size);
>> }
>> EXPORT_SYMBOL(find_next_bit_le);
>> #endif
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 1/3] lib: find_*_bit reimplementation
2015-02-02 3:17 ` [PATCH v2 1/3] lib: find_*_bit reimplementation George Spelvin
@ 2015-02-04 23:07 ` Yury
0 siblings, 0 replies; 14+ messages in thread
From: Yury @ 2015-02-04 23:07 UTC (permalink / raw)
To: George Spelvin, akpm, chris, davem, dborkman, hannes,
klimov.linux, laijs, msalter, takahiro.akashi, tgraf,
valentinrothberg
Cc: linux-kernel, y.norov, Yury Norov
On 02.02.2015 06:17, George Spelvin wrote:
> Yury Norov <y.norov@samsung.com> wrote:
>> New implementations takes less space in source file (see diffstat)
>> and in object. For me it's 710 vs 453 bytes of text.
>>
>> Patch was boot-tested on x86_64 and MIPS (big-endian) machines.
>> Performance tests were ran on userspace with code like this:
>>
>> /* addr[] is filled from /dev/urandom */
>> start = clock();
>> while (ret < nbits)
>> ret = find_next_bit(addr, nbits, ret + 1);
>>
>> end = clock();
>> printf("%ld\t", (unsigned long) end - start);
>> On Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz rezults are next:
>> (for find_next_bit, nbits is 8M, for find_first_bit - 80K)
>>
>> find_next_bit: find_first_bit:
>> new current new current
>> 26932 43151 14777 14925
>> 26947 43182 14521 15423
> I'll look at this more carefully, but one immediate thought is that this
> is an unrealistic benchmark. It will amost never need to look at more
> than one word of the array, but real arrays have long runs of zero
> bits to skip over.
>
> So the code size is appreciated, but the time benefits may be the result
> of you optimizing for the wrong thing.
>
> I'd try filling the array with mostly-identical bits, flipping with odds
> of 1/256 or so.
>
> For full generality, I'd test different 1->0 and 0->1 transition
> probabilities. (But powers of two are probably enough for benchmarking.)
>
I think, test with random values represents at least one situation: well-fragmented memory
after long time work. (This is what I really have in my project.) In other hand, if long zero runs
is a typical behavior for one's system, it's a good opportunity for improvements, I think.
Anyway, the idea of testing find_bit on a long runs is good. Thank you.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 1/3] lib: find_*_bit reimplementation
2015-02-02 12:56 ` Rasmus Villemoes
@ 2015-02-04 23:45 ` Yury
2015-02-05 14:51 ` Rasmus Villemoes
0 siblings, 1 reply; 14+ messages in thread
From: Yury @ 2015-02-04 23:45 UTC (permalink / raw)
To: Rasmus Villemoes, George Spelvin
Cc: akpm, chris, davem, dborkman, hannes, klimov.linux, laijs,
linux-kernel, msalter, takahiro.akashi, tgraf, valentinrothberg,
Yury Norov
On 02.02.2015 15:56, Rasmus Villemoes wrote:
> On Mon, Feb 02 2015, "George Spelvin" <linux@horizon.com> wrote:
>
>> Rasmus Villemoes <linux@rasmusvillemoes.dk> wrote:
>>> ... and this be part of _find_next_bit? Can find_next_bit not be simply
>>> 'return _find_next_bit(addr, size, offset, 1);', and similarly for
>>> find_next_zero_bit? Btw., passing true and false for the boolean
>>> parameter may be a little clearer.
>> Looking at the generated code, it would be better to replace the boolean
>> parameter with 0ul or ~0ul and XOR with it. The same number of registers,
>> and saves a conditional branch.
> Nice trick. When I compiled it, gcc inlined _find_next_bit into both its
> callers, making the conditional go away completely. That was with gcc
> 4.7. When I try with 5.0, I do see _find_next_bit being compiled
> separately.
>
> With the proposed change, 4.7 also makes find_next{,_zero}_bit wrappers
> for _find_next_bit, further reducing the total size, which is a good
> thing. And, if some other version decides to still inline it, it
> should then know how to optimize the xor with 0ul or ~0ul just as well
> as when the conditional was folded away.
>
> Yury, please also incorporate this in the next round.
>
> Rasmus
>
Ok.
What are you thinking about joining _find_next_bit and _find_next_bit_le?
They really differ in 2 lines. It's generally good to remove duplications,
and it may decrease text size for big-endian machines. But it definitely
doesn't make code easier for reading, and maybe affects performance
after the optimization suggested by George...
(I didn't test it yet)
29 #if !defined(find_next_bit) || !defined(find_next_zero_bit) \
30 || (defined(BIG_ENDIAN) && \
31 (!defined(find_next_bit_le) || !defined(find_next_zero_bit_le)))
32 static unsigned long _find_next_bit(const unsigned long *addr,
33 unsigned long nbits, unsigned long start, unsigned long flags)
34 {
35 unsigned long xor_mask = (flags & SET) ? 0UL : ULONG_MAX;
36 unsigned long tmp = addr[start / BITS_PER_LONG] ^ xor_mask;
37
38 /* Handle 1st word. */
39 if (!IS_ALIGNED(start, BITS_PER_LONG)) {
40 #ifdef BIG_ENDIAN
41 if (flags & LE)
42 tmp &= ext2_swab(HIGH_BITS_MASK(start % BITS_PER_LONG));
43 else
44 #endif
45 tmp &= HIGH_BITS_MASK(start % BITS_PER_LONG);
46
47 start = round_down(start, BITS_PER_LONG);
48 }
49
50 while (!tmp) {
51 start += BITS_PER_LONG;
52 if (start >= nbits)
53 return nbits;
54
55 tmp = addr[start / BITS_PER_LONG] ^ xor_mask;
56 }
57
58 #ifdef BIG_ENDIAN
59 if (flags & LE)
60 return start + __ffs(ext2_swab(tmp));
61
62 #endif
63 return start + __ffs(tmp);
64 }
65 #endif
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 1/3] lib: find_*_bit reimplementation
2015-02-04 23:45 ` Yury
@ 2015-02-05 14:51 ` Rasmus Villemoes
0 siblings, 0 replies; 14+ messages in thread
From: Rasmus Villemoes @ 2015-02-05 14:51 UTC (permalink / raw)
To: Yury
Cc: George Spelvin, akpm, chris, davem, dborkman, hannes,
klimov.linux, laijs, linux-kernel, msalter, takahiro.akashi,
tgraf, valentinrothberg, Yury Norov
On Thu, Feb 05 2015, Yury <yury.norov@gmail.com> wrote:
> On 02.02.2015 15:56, Rasmus Villemoes wrote:
>> On Mon, Feb 02 2015, "George Spelvin" <linux@horizon.com> wrote:
>>
>>> Rasmus Villemoes <linux@rasmusvillemoes.dk> wrote:
>>>> ... and this be part of _find_next_bit? Can find_next_bit not be simply
>>>> 'return _find_next_bit(addr, size, offset, 1);', and similarly for
>>>> find_next_zero_bit? Btw., passing true and false for the boolean
>>>> parameter may be a little clearer.
>>> Looking at the generated code, it would be better to replace the boolean
>>> parameter with 0ul or ~0ul and XOR with it. The same number of registers,
>>> and saves a conditional branch.
>> Nice trick. When I compiled it, gcc inlined _find_next_bit into both its
>> callers, making the conditional go away completely. That was with gcc
>> 4.7. When I try with 5.0, I do see _find_next_bit being compiled
>> separately.
>>
>> With the proposed change, 4.7 also makes find_next{,_zero}_bit wrappers
>> for _find_next_bit, further reducing the total size, which is a good
>> thing. And, if some other version decides to still inline it, it
>> should then know how to optimize the xor with 0ul or ~0ul just as well
>> as when the conditional was folded away.
>>
>> Yury, please also incorporate this in the next round.
>>
>> Rasmus
>>
> Ok.
Good.
> What are you thinking about joining _find_next_bit and
> _find_next_bit_le?
I don't think that should be done right now, if at all. The series is
pretty close to getting my Reviewed-by; I'd prefer not to start over.
Rasmus
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 1/3] lib: find_*_bit reimplementation
2015-02-04 22:52 ` Yury
@ 2015-02-05 15:01 ` Rasmus Villemoes
0 siblings, 0 replies; 14+ messages in thread
From: Rasmus Villemoes @ 2015-02-05 15:01 UTC (permalink / raw)
To: Yury
Cc: klimov.linux, davem, akpm, hannes, dborkman, laijs,
takahiro.akashi, valentinrothberg, linux, msalter, chris, tgraf,
linux-kernel, Yury Norov
On Wed, Feb 04 2015, Yury <yury.norov@gmail.com> wrote:
> On 02.02.2015 13:43, Rasmus Villemoes wrote:
>>> @@ -23,86 +50,22 @@
>>> unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
>>> unsigned long offset)
>>> {
>>> - const unsigned long *p = addr + BITOP_WORD(offset);
>>> - unsigned long result = offset & ~(BITS_PER_LONG-1);
>>> - unsigned long tmp;
>>> -
>>> if (offset >= size)
>>> return size;
>> Why can't this ...
>>
>>
>>> - size -= result;
>>> - offset %= BITS_PER_LONG;
>>> - if (offset) {
>>> - tmp = *(p++);
>>> - tmp &= (~0UL << offset);
>>> - if (size < BITS_PER_LONG)
>>> - goto found_first;
>>> - if (tmp)
>>> - goto found_middle;
>>> - size -= BITS_PER_LONG;
>>> - result += BITS_PER_LONG;
>>> - }
>>> - while (size & ~(BITS_PER_LONG-1)) {
>>> - if ((tmp = *(p++)))
>>> - goto found_middle;
>>> - result += BITS_PER_LONG;
>>> - size -= BITS_PER_LONG;
>>> - }
>>> - if (!size)
>>> - return result;
>>> - tmp = *p;
>>>
>>> -found_first:
>>> - tmp &= (~0UL >> (BITS_PER_LONG - size));
>>> - if (tmp == 0UL) /* Are any bits set? */
>>> - return result + size; /* Nope. */
>>> -found_middle:
>>> - return result + __ffs(tmp);
>>> + return min(_find_next_bit(addr, size, offset, 1), size);
>> ... and this be part of _find_next_bit? Can find_next_bit not be simply
>> 'return _find_next_bit(addr, size, offset, 1);', and similarly for
>> find_next_zero_bit? Btw., passing true and false for the boolean
>> parameter may be a little clearer.
> I moved size checkers out of '_find_next_bit' to let user call it from his code
> if he knows for sure that size/offset pair is valid. This may help save a couple
> of clocks. I think, I'll walk over the code to find how many such places we have.
> If not too much / not in critical paths, checks may be moved into the function.
But _find_next_bit is static, so outsiders can't call it... The branches
are easily predicted and hence almost free, so I think it's better to do
the code deduplication and move the bounds checking inside
_find_next_bit, so that find_next_bit is literally just 'return
_find_next_bit(addr, size, offset, 0ul);' and find_next_zero_bit is
'return _find_next_bit(addr, size, offset, ~0ul);'.
Rasmus
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 1/3] lib: find_*_bit reimplementation
@ 2015-02-05 23:07 Alexey Klimov
0 siblings, 0 replies; 14+ messages in thread
From: Alexey Klimov @ 2015-02-05 23:07 UTC (permalink / raw)
To: Yury Norov; +Cc: Rasmus Villemoes, linux, Linux Kernel Mailing List
[-- Attachment #1: Type: text/plain, Size: 862 bytes --]
On Mon, Feb 2, 2015 at 3:45 PM, Yury Norov <yury.norov@gmail.com> wrote:
> Alexey,
>
> Yes, ARM has it's own implementation for subj. If you're interested in
> testing
> my patch on your odroid, try this. (Sorry, I have to attach the patch due to
> restrictions on mail agent at office).
Hi Yury,
(please don't drop people from cc; restored mail list cc; re-attach
patch for testing)
As advised please include email [PATCH 0/3] with descriptions and
maybe insert your patch-for-testing there that makes ARM arch to use
generic find_*_bit functions instead of platform ones.
I turn kernel to use generic find_bit and friends functions
implementation on ARMv7 and boot-tested your patch on odroid-xu3
(ARMv7 SoC). Boots and works fine. So if you need my tested-by here it
is:
Tested-by: Alexey Klimov <klimov.linux@gmail.com>
--
Best regards,
Klimov Alexey
[-- Attachment #2: 0001-arm-lib-turn-kernel-to-use-generic-implementation-of.patch --]
[-- Type: text/x-patch, Size: 3389 bytes --]
From 8e17e77e7b20874fea6ac38c5d3102ed56e014c7 Mon Sep 17 00:00:00 2001
From: Yury Norov <y.norov@samsung.com>
Date: Mon, 2 Feb 2015 15:11:07 +0300
Subject: [PATCH] arm: lib: disable platform implementation of 'find_*_bit'
Alexey,
ARM has it's own implementation for subj. If you're interested in testing
my patch on your odroid, try this.
Best regards,
Yury Norov.
---
arch/arm/include/asm/bitops.h | 19 -------------------
arch/arm/kernel/armksyms.c | 11 -----------
arch/arm/lib/Makefile | 2 +-
include/asm-generic/bitops/le.h | 1 +
4 files changed, 2 insertions(+), 31 deletions(-)
diff --git a/arch/arm/include/asm/bitops.h b/arch/arm/include/asm/bitops.h
index 5638099..e0611d1 100644
--- a/arch/arm/include/asm/bitops.h
+++ b/arch/arm/include/asm/bitops.h
@@ -192,25 +192,6 @@ extern int _find_next_bit_be(const unsigned long *p, int size, int offset);
#define test_and_clear_bit(nr,p) ATOMIC_BITOP(test_and_clear_bit,nr,p)
#define test_and_change_bit(nr,p) ATOMIC_BITOP(test_and_change_bit,nr,p)
-#ifndef __ARMEB__
-/*
- * These are the little endian, atomic definitions.
- */
-#define find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz)
-#define find_next_zero_bit(p,sz,off) _find_next_zero_bit_le(p,sz,off)
-#define find_first_bit(p,sz) _find_first_bit_le(p,sz)
-#define find_next_bit(p,sz,off) _find_next_bit_le(p,sz,off)
-
-#else
-/*
- * These are the big endian, atomic definitions.
- */
-#define find_first_zero_bit(p,sz) _find_first_zero_bit_be(p,sz)
-#define find_next_zero_bit(p,sz,off) _find_next_zero_bit_be(p,sz,off)
-#define find_first_bit(p,sz) _find_first_bit_be(p,sz)
-#define find_next_bit(p,sz,off) _find_next_bit_be(p,sz,off)
-
-#endif
#if __LINUX_ARM_ARCH__ < 5
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index a88671c..22e8748 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -146,17 +146,6 @@ EXPORT_SYMBOL(_clear_bit);
EXPORT_SYMBOL(_test_and_clear_bit);
EXPORT_SYMBOL(_change_bit);
EXPORT_SYMBOL(_test_and_change_bit);
-EXPORT_SYMBOL(_find_first_zero_bit_le);
-EXPORT_SYMBOL(_find_next_zero_bit_le);
-EXPORT_SYMBOL(_find_first_bit_le);
-EXPORT_SYMBOL(_find_next_bit_le);
-
-#ifdef __ARMEB__
-EXPORT_SYMBOL(_find_first_zero_bit_be);
-EXPORT_SYMBOL(_find_next_zero_bit_be);
-EXPORT_SYMBOL(_find_first_bit_be);
-EXPORT_SYMBOL(_find_next_bit_be);
-#endif
#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_OLD_MCOUNT
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 0573faa..de369aa 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -6,7 +6,7 @@
lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
- delay.o delay-loop.o findbit.o memchr.o memcpy.o \
+ delay.o delay-loop.o memchr.o memcpy.o \
memmove.o memset.o memzero.o setbit.o \
strchr.o strrchr.o \
testchangebit.o testclearbit.o testsetbit.o \
diff --git a/include/asm-generic/bitops/le.h b/include/asm-generic/bitops/le.h
index 6173154..9a8798f 100644
--- a/include/asm-generic/bitops/le.h
+++ b/include/asm-generic/bitops/le.h
@@ -2,6 +2,7 @@
#define _ASM_GENERIC_BITOPS_LE_H_
#include <asm/types.h>
+#include <asm-generic/bitops/find.h>
#include <asm/byteorder.h>
#if defined(__LITTLE_ENDIAN)
--
1.9.1
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2015-02-05 23:07 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-01-31 20:58 [PATCH v2 1/3] lib: find_*_bit reimplementation yury.norov
2015-01-31 20:58 ` [PATCH v2 2/3] lib: move find_last_bit to lib/find_next_bit.c yury.norov
2015-01-31 20:58 ` [PATCH v2 3/3] lib: rename lib/find_next_bit.c to lib/find_bit.c yury.norov
2015-02-02 11:09 ` Rasmus Villemoes
2015-02-02 3:17 ` [PATCH v2 1/3] lib: find_*_bit reimplementation George Spelvin
2015-02-04 23:07 ` Yury
2015-02-02 10:43 ` Rasmus Villemoes
2015-02-02 11:47 ` George Spelvin
2015-02-02 12:56 ` Rasmus Villemoes
2015-02-04 23:45 ` Yury
2015-02-05 14:51 ` Rasmus Villemoes
2015-02-04 22:52 ` Yury
2015-02-05 15:01 ` Rasmus Villemoes
2015-02-05 23:07 Alexey Klimov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).