Linux-Fsdevel Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
To: tytso@mit.edu, david@fromorbit.com, bpm@sgi.com, olaf@sgi.com
Cc: linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	kernel@lists.collabora.co.uk, alvaro.soliverez@collabora.co.uk,
	Gabriel Krisman Bertazi <krisman@collabora.co.uk>
Subject: [PATCH RFC 07/13] charsets: utf8: Hook-up utf-8 code to charsets library
Date: Fri, 12 Jan 2018 05:12:28 -0200	[thread overview]
Message-ID: <20180112071234.29470-8-krisman@collabora.co.uk> (raw)
In-Reply-To: <20180112071234.29470-1-krisman@collabora.co.uk>

Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
---
 lib/charsets/Makefile    |   2 +-
 lib/charsets/utf8_core.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 179 insertions(+), 1 deletion(-)
 create mode 100644 lib/charsets/utf8_core.c

diff --git a/lib/charsets/Makefile b/lib/charsets/Makefile
index 95389c4193b0..5e2fa7c20a47 100644
--- a/lib/charsets/Makefile
+++ b/lib/charsets/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_CHARSETS) += charsets.o
 
 obj-$(CONFIG_CHARSETS) += ascii.o
 
-utf8-y += utf8norm.o
+utf8-y += utf8_core.o utf8norm.o
 obj-$(CONFIG_UTF8_NORMALIZATION) +=  utf8.o
 
 $(obj)/utf8norm.o: $(obj)/utf8data.h
diff --git a/lib/charsets/utf8_core.c b/lib/charsets/utf8_core.c
new file mode 100644
index 000000000000..94427670e96e
--- /dev/null
+++ b/lib/charsets/utf8_core.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2017 Collabora Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/charsets.h>
+#include <linux/utf8norm.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/string.h>
+
+/*
+ * utf8_strncmp - compare two UTF-8 strings through their normalized form
+ * @charset: charset whose ->version selects the normalization table
+ * @str1:    first string
+ * @str2:    second string
+ * @len:     maximum number of normalized bytes to compare
+ *
+ * Each string is walked with a utf8nfkdi() cursor and the bytes produced
+ * by utf8byte() are compared pairwise.  utf8cursor() is given no length,
+ * so both strings are presumably expected to be NUL-terminated - TODO
+ * confirm against the callers.  Note that @len bounds the number of
+ * *normalized* bytes examined, not the number of input bytes.
+ *
+ * Return: 0 when the normalized forms agree (for @len bytes, or until both
+ * strings end), 1 when they differ, -EIO when a cursor cannot be set up or
+ * a string cannot be normalized.
+ */
+static int utf8_strncmp(const struct charset *charset, const char *str1,
+			const char *str2, int len)
+{
+	const struct utf8data *data = utf8nfkdi(charset->version);
+	struct utf8cursor cur1, cur2;
+	/*
+	 * Kept as int rather than unsigned char so that a failure reported
+	 * by utf8byte() is not truncated into an ordinary byte value.
+	 * Assumes utf8byte() signals failure with a negative return.
+	 */
+	int c1, c2;
+	int i;
+
+	if (utf8cursor(&cur1, data, str1) < 0)
+		return -EIO;
+	if (utf8cursor(&cur2, data, str2) < 0)
+		return -EIO;
+
+	for (i = 0; i < len; i++) {
+		c1 = utf8byte(&cur1);
+		c2 = utf8byte(&cur2);
+
+		if (c1 < 0 || c2 < 0)
+			return -EIO;
+		if (c1 != c2)
+			return 1;
+		/* Both strings ended at the same point: they are equal. */
+		if (!c1)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * utf8_strncasecmp - case-insensitive comparison of two UTF-8 strings
+ * @charset: charset whose ->version selects the normalization table
+ * @str1:    first string
+ * @str2:    second string
+ * @len:     maximum number of normalized bytes to compare
+ *
+ * Each string is walked with a utf8nfkdicf() cursor (the case-folding
+ * table) and the bytes produced by utf8byte() are compared pairwise.
+ * utf8cursor() is given no length, so both strings are presumably
+ * expected to be NUL-terminated - TODO confirm against the callers.
+ * @len bounds the number of *normalized* bytes examined, not the number
+ * of input bytes.
+ *
+ * Return: 0 when the folded forms agree (for @len bytes, or until both
+ * strings end), 1 when they differ, -EIO when a cursor cannot be set up or
+ * a string cannot be normalized.
+ */
+static int utf8_strncasecmp(const struct charset *charset, const char *str1,
+			    const char *str2, int len)
+{
+	const struct utf8data *data = utf8nfkdicf(charset->version);
+	struct utf8cursor cur1, cur2;
+	/*
+	 * Kept as int rather than unsigned char so that a failure reported
+	 * by utf8byte() is not truncated into an ordinary byte value.
+	 * Assumes utf8byte() signals failure with a negative return.
+	 */
+	int c1, c2;
+	int i;
+
+	if (utf8cursor(&cur1, data, str1) < 0)
+		return -EIO;
+	if (utf8cursor(&cur2, data, str2) < 0)
+		return -EIO;
+
+	for (i = 0; i < len; i++) {
+		c1 = utf8byte(&cur1);
+		c2 = utf8byte(&cur2);
+
+		if (c1 < 0 || c2 < 0)
+			return -EIO;
+		if (c1 != c2)
+			return 1;
+		/* Both strings ended at the same point: they are equal. */
+		if (!c1)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * utf8_casefold - produce the case-folded, normalized form of a string
+ * @charset:    charset whose ->version selects the normalization table
+ * @str:        input string, read through a utf8nfkdicf() cursor
+ * @len:        currently unused; utf8cursor() takes no length, so @str is
+ *              presumably expected to be NUL-terminated - TODO confirm
+ * @folded_str: on success, set to a kstrdup()'d copy of the result.  This
+ *              function never frees it, so ownership passes to the caller.
+ *
+ * The result is assembled in a fixed on-stack buffer, so output longer
+ * than sizeof(buffer) - 1 bytes is silently truncated.
+ * NOTE(review): 1 KiB is a large kernel stack frame; consider building the
+ * result directly in a heap buffer.
+ *
+ * Return: number of bytes in the folded string (excluding the NUL),
+ * -EIO if the cursor cannot be set up or @str cannot be normalized,
+ * -ENOMEM if the copy cannot be allocated.  *@folded_str is left untouched
+ * on -EIO.
+ */
+int utf8_casefold(const struct charset *charset, const char *str, int len,
+		  char **folded_str)
+{
+	const struct utf8data *data = utf8nfkdicf(charset->version);
+	struct utf8cursor cur;
+	char buffer[1024];
+	int c, i;
+
+	if (utf8cursor(&cur, data, str))
+		return -EIO;
+
+	for (i = 0; i < (int)sizeof(buffer) - 1; i++) {
+		/*
+		 * Inspect the value as an int before storing it: assumes
+		 * utf8byte() signals failure with a negative return, which
+		 * would otherwise be stored as a bogus non-NUL byte.
+		 */
+		c = utf8byte(&cur);
+		if (c < 0)
+			return -EIO;
+		if (!c)
+			break;
+		buffer[i] = c;
+	}
+	buffer[i] = '\0';
+
+	*folded_str = kstrdup(buffer, GFP_NOFS);
+	if (!*folded_str)
+		return -ENOMEM;
+
+	return i;
+}
+
+/*
+ * utf8_normalize - produce the normalized form of a string
+ * @charset:       charset whose ->version selects the normalization table
+ * @str:           input string, read through a utf8nfkdi() cursor
+ * @len:           currently unused; utf8cursor() takes no length, so @str
+ *                 is presumably expected to be NUL-terminated - TODO confirm
+ * @normalization: on success, set to a kstrdup()'d copy of the result.
+ *                 This function never frees it, so ownership passes to the
+ *                 caller.
+ *
+ * The result is assembled in a fixed on-stack buffer, so output longer
+ * than sizeof(buffer) - 1 bytes is silently truncated.
+ * NOTE(review): 1 KiB is a large kernel stack frame; consider building the
+ * result directly in a heap buffer.
+ *
+ * Return: number of bytes in the normalized string (excluding the NUL),
+ * -EIO if the cursor cannot be set up or @str cannot be normalized,
+ * -ENOMEM if the copy cannot be allocated.  *@normalization is left
+ * untouched on -EIO.
+ */
+int utf8_normalize(const struct charset *charset, const char *str, int len,
+		   char **normalization)
+{
+	const struct utf8data *data = utf8nfkdi(charset->version);
+	struct utf8cursor cur;
+	char buffer[1024];
+	int c, i;
+
+	if (utf8cursor(&cur, data, str))
+		return -EIO;
+
+	for (i = 0; i < (int)sizeof(buffer) - 1; i++) {
+		/*
+		 * Inspect the value as an int before storing it: assumes
+		 * utf8byte() signals failure with a negative return, which
+		 * would otherwise be stored as a bogus non-NUL byte.
+		 */
+		c = utf8byte(&cur);
+		if (c < 0)
+			return -EIO;
+		if (!c)
+			break;
+		buffer[i] = c;
+	}
+	buffer[i] = '\0';
+
+	*normalization = kstrdup(buffer, GFP_NOFS);
+	if (!*normalization)
+		return -ENOMEM;
+
+	return i;
+}
+
+/* Operation table that routes the generic charset calls to the UTF-8 code. */
+static const struct charset_ops utf8_ops = {
+	.strncmp	= utf8_strncmp,
+	.strncasecmp	= utf8_strncasecmp,
+	.casefold	= utf8_casefold,
+	.normalize	= utf8_normalize,
+};
+
+/*
+ * utf8_load_charset - build a charset object for a requested Unicode version
+ * @pargs: array of at least three substring_t entries holding, in order,
+ *         the major, minor and revision numbers (presumably the captures
+ *         of the "utf8-%d.%d.%d" token in utf8_info - confirm against the
+ *         charsets core)
+ *
+ * Return: a kmalloc()'d struct charset with ->version set to the encoded
+ * Unicode age and ->ops pointing at utf8_ops, or NULL if the version
+ * cannot be parsed, is negative, is not supported by the UTF-8 tables, or
+ * the allocation fails.  ->info is left NULL; any further members of
+ * struct charset are not initialized here.
+ */
+static struct charset *utf8_load_charset(void *pargs)
+{
+	substring_t *args = pargs;
+	struct charset *charset;
+	unsigned int age;
+	int maj, min, rev;
+
+	if (match_int(&args[0], &maj) || match_int(&args[1], &min) ||
+	    match_int(&args[2], &rev))
+		return NULL;
+
+	/*
+	 * match_int() parses signed integers, so "utf8--1.0.0" style input
+	 * can get here.  A negative component is never a valid version and
+	 * must not be folded into the unsigned age value.
+	 */
+	if (maj < 0 || min < 0 || rev < 0)
+		return NULL;
+
+	age = UNICODE_AGE(maj, min, rev);
+
+	if (!utf8version_is_supported(age))
+		return NULL;
+
+	charset = kmalloc(sizeof(*charset), GFP_KERNEL);
+	if (!charset)
+		return NULL;
+
+	charset->info = NULL;
+	charset->version = age;
+	charset->ops = &utf8_ops;
+
+	return charset;
+}
+
+/*
+ * Descriptor handed to charset_register(): the charset name, the token
+ * pattern carrying the three version numbers, and the loader callback.
+ */
+static struct charset_info utf8_info = {
+	.name		= "utf8",
+	.match_token	= "utf8-%d.%d.%d",
+	.load_charset	= utf8_load_charset,
+};
+
+/*
+ * Module init: register the UTF-8 charset descriptor with the charsets
+ * core.  Always reports success.
+ *
+ * NOTE(review): whatever charset_register() returns is discarded here; if
+ * it can fail, its result should be propagated - confirm its prototype.
+ */
+static int __init init_utf8(void)
+{
+	charset_register(&utf8_info);
+	return 0;
+}
+
+/*
+ * Module exit: intentionally empty.
+ *
+ * NOTE(review): utf8_info is registered in init_utf8() but nothing undoes
+ * that here.  If this code can be built as a loadable module, the charsets
+ * core would presumably keep pointing at utf8_info after unload - confirm
+ * whether an unregister call exists and should be used.
+ */
+static void __exit exit_utf8(void)
+{
+}
+
+/*
+ * Module entry/exit hooks and metadata.
+ * NOTE(review): this file does not include <linux/module.h> itself; these
+ * macros are presumably reaching it through one of the other headers.
+ */
+module_init(init_utf8);
+module_exit(exit_utf8);
+MODULE_AUTHOR("Gabriel Krisman Bertazi");
+MODULE_DESCRIPTION("UTF-8 charset operations for filesystems");
+MODULE_LICENSE("GPL");
+
-- 
2.15.1

  parent reply	other threads:[~2018-01-12  7:13 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-01-12  7:12 [PATCH RFC 00/13] UTF-8 case insensitive lookups for EXT4 Gabriel Krisman Bertazi
2018-01-12  7:12 ` [PATCH RFC 01/13] charsets: Introduce middle-layer for character encoding Gabriel Krisman Bertazi
2018-01-12  7:12 ` [PATCH RFC 02/13] charsets: ascii: Wrap ascii functions to charsets library Gabriel Krisman Bertazi
2018-01-12  7:12 ` [PATCH RFC 03/13] charsets: utf8: Add unicode character database files Gabriel Krisman Bertazi
2018-01-12 16:59   ` Darrick J. Wong
2018-01-12 20:29     ` Weber, Olaf (HPC Data Management & Storage)
2018-01-13  0:24   ` Theodore Ts'o
2018-01-13  4:28     ` Gabriel Krisman Bertazi
2018-01-12  7:12 ` [PATCH RFC 04/13] scripts: add trie generator for UTF-8 Gabriel Krisman Bertazi
2018-01-12  7:12 ` [PATCH RFC 05/13] charsets: utf8: Introduce code for UTF-8 normalization Gabriel Krisman Bertazi
2018-01-12  7:12 ` [PATCH RFC 06/13] charsets: utf8: reduce the size of utf8data[] Gabriel Krisman Bertazi
2018-01-12  7:12 ` Gabriel Krisman Bertazi [this message]
2018-01-12 10:38   ` [PATCH RFC 07/13] charsets: utf8: Hook-up utf-8 code to charsets library Weber, Olaf (HPC Data Management & Storage)
2018-01-16 16:50     ` Gabriel Krisman Bertazi
2018-01-16 22:19       ` Weber, Olaf (HPC Data Management & Storage)
2018-01-23  3:33         ` Gabriel Krisman Bertazi
2018-01-12  7:12 ` [PATCH RFC 08/13] charsets: utf8: Introduce test module for kernel UTF-8 implementation Gabriel Krisman Bertazi
2018-01-12  7:12 ` [PATCH RFC 09/13] ext4: Add ignorecase mount option Gabriel Krisman Bertazi
2018-01-12  7:12 ` [PATCH RFC 10/13] ext4: Include encoding information on the superblock Gabriel Krisman Bertazi
2018-01-12  7:12 ` [PATCH RFC 11/13] fscrypt: Introduce charset-based matching functions Gabriel Krisman Bertazi
2018-01-12  7:12 ` [PATCH RFC 12/13] ext4: Support charset name matching Gabriel Krisman Bertazi
2018-01-12  7:12 ` [PATCH RFC 13/13] ext4: Implement ext4 dcache hooks for custom charsets Gabriel Krisman Bertazi
2018-01-12 10:52   ` Weber, Olaf (HPC Data Management & Storage)
2018-01-12 16:56 ` [PATCH RFC 00/13] UTF-8 case insensitive lookups for EXT4 Jeremy Allison

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180112071234.29470-8-krisman@collabora.co.uk \
    --to=krisman@collabora.co.uk \
    --cc=alvaro.soliverez@collabora.co.uk \
    --cc=bpm@sgi.com \
    --cc=david@fromorbit.com \
    --cc=kernel@lists.collabora.co.uk \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=olaf@sgi.com \
    --cc=tytso@mit.edu \
    --subject='Re: [PATCH RFC 07/13] charsets: utf8: Hook-up utf-8 code to charsets library' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).