From 8f403d38ac1e535a738d7de2a0207f3246d9fc81 Mon Sep 17 00:00:00 2001
From: "yu.dongliang" <18588496441@163.com>
Date: Sat, 2 May 2026 14:43:52 +0800
Subject: [PATCH] add scf_utf8_len() to get a UTF-8 char & its bytes

---
 asm/1.s           |   2 +-
 util/Makefile     |   4 +-
 util/scf_string.c | 119 +++++++++++++++++++++++++++++++++++++++++++++-
 util/scf_string.h |  14 +++---
 4 files changed, 128 insertions(+), 11 deletions(-)

diff --git a/asm/1.s b/asm/1.s
index 3e98556..cfe44f0 100644
--- a/asm/1.s
+++ b/asm/1.s
@@ -14,4 +14,4 @@ main:
 .org 509
 1:
 	call 0b
-.asciz "hello world\n"
+.asciz "/bin/sh"
diff --git a/util/Makefile b/util/Makefile
index 5c98436..7449e00 100644
--- a/util/Makefile
+++ b/util/Makefile
@@ -1,10 +1,10 @@
-#CFILES += scf_string.c
+CFILES += scf_string.c
 #CFILES += scf_list_test.c
 #CFILES += scf_stack_test.c
 #CFILES += scf_vector_test.c
 #CFILES += scf_graph_test.c
 #CFILES += scf_graph.c
-CFILES += scf_rbtree.c
+#CFILES += scf_rbtree.c
 
 CFLAGS += -g -O3
 
diff --git a/util/scf_string.c b/util/scf_string.c
index 1da6366..ac82cd5 100644
--- a/util/scf_string.c
+++ b/util/scf_string.c
@@ -148,6 +148,32 @@ int scf_string_copy(scf_string_t* s0, const scf_string_t* s1)
 	return 0;
 }
 
+/*
+ * Copy the first len bytes of str into s0 by wrapping them in a temporary
+ * scf_string_t and passing it to scf_string_copy(), whose result is returned.
+ */
+int scf_string_copy_cstr_len(scf_string_t* s0, const char* str, size_t len)
+{
+	scf_string_t s1;
+	s1.capacity = -1;
+	s1.len      = len;
+	s1.data     = (uint8_t*)str;
+
+	return scf_string_copy(s0, &s1);
+}
+
+/*
+ * Same as scf_string_copy_cstr_len() with len taken from strlen(str).
+ * Returns -EINVAL when str is NULL instead of passing NULL to strlen().
+ */
+int scf_string_copy_cstr(scf_string_t* s0, const char* str)
+{
+	if (!str)
+		return -EINVAL;
+
+	return scf_string_copy_cstr_len(s0, str, strlen(str));
+}
+
 int scf_string_cat(scf_string_t* s0, const scf_string_t* s1)
 {
 	if (!s0 || !s1 || !s0->data || !s1->data)
@@ -329,6 +355,78 @@ int scf_string_get_offset(scf_string_t* str, const char* data, size_t len)
 	return ret;
 }
 
+/*
+ * Decode the UTF-8 sequence starting at str.
+ *
+ * In:  *len is the number of bytes readable at str.
+ * Out: on success *len is the byte length of the sequence (1..6) and the
+ *      decoded value is returned; on failure -EINVAL is returned and *len is
+ *      left unchanged.
+ *
+ * The legacy 5- and 6-byte forms are accepted; overlong encodings and
+ * surrogate values are not rejected.
+ */
+int scf_utf8_len(const uint8_t* str, size_t* len)
+{
+	size_t n;
+	int    c;
+	int    i;
+
+	/* never read the lead byte from a missing or empty buffer */
+	if (!str || !len || 0 == *len)
+		return -EINVAL;
+
+	c = *str;
+
+	/* the lead byte fixes the length; keep only its payload bits in c */
+	if (c < 0x80)
+		n = 1;
+	else if (0x6 == (c >> 5)) {
+		c &= 0x1f;
+		n = 2;
+
+	} else if (0xe == (c >> 4)) {
+		c &= 0xf;
+		n = 3;
+
+	} else if (0x1e == (c >> 3)) {
+		c &= 0x7;
+		n = 4;
+
+	} else if (0x3e == (c >> 2)) {
+		c &= 0x3;
+		n = 5;
+
+	} else if (0x7e == (c >> 1)) {
+		c &= 0x1;
+		n = 6;
+	} else {
+		scf_loge("utf8 first byte wrong %#x\n", c);
+		return -EINVAL;
+	}
+
+	if (n > *len) {
+		scf_loge("utf8 len error, needs %zu, real: %zu\n", n, *len);
+		return -EINVAL;
+	}
+
+	/* each continuation byte is 10xxxxxx and adds 6 payload bits */
+	for (i = 1; i < (int)n; i++) {
+		int c2 = str[i];
+
+		if (0x2 == (c2 >> 6)) {
+			c <<= 6;
+			c  |= c2 & 0x3f;
+		} else {
+			scf_loge("utf8 byte[%d] wrong %#x\n", i, c2);
+			return -EINVAL;
+		}
+	}
+
+	*len = n;
+	return c;
+}
+
 #if 0
 int main(int argc, char* argv[])
 {
@@ -371,6 +469,26 @@ int main(int argc, char* argv[])
 		printf("i: %d, offset: %d\n", i, offset);
 	}
 
+	uint8_t* utf8 = (uint8_t*)"你好, hello";
+	size_t   N    = strlen((char*)utf8);
+	size_t   n    = N;
+
+	/* N tracks the bytes still unread, so every call gets the true remainder */
+	int c = scf_utf8_len(utf8, &n);
+	printf("c: %.*s:%d, n: %zu\n", (int)n, (char*)utf8, c, n);
+
+	utf8 += n;
+	N    -= n;
+	n     = N;
+	c     = scf_utf8_len(utf8, &n);
+	printf("c: %.*s:%d, n: %zu\n", (int)n, (char*)utf8, c, n);
+
+	utf8 += n;
+	N    -= n;
+	n     = N;
+	c     = scf_utf8_len(utf8, &n);
+	printf("c: %.*s:%d, n: %zu\n", (int)n, (char*)utf8, c, n);
+
 	scf_string_free(s0);
 	scf_string_free(s1);
 	scf_string_free(s2);
@@ -378,4 +496,3 @@ int main(int argc, char* argv[])
 	return 0;
 }
 #endif
-
diff --git a/util/scf_string.h b/util/scf_string.h
index faf1e2f..082d2b5 100644
--- a/util/scf_string.h
+++ b/util/scf_string.h
@@ -4,14 +4,13 @@
 #include"scf_vector.h"
 
 typedef struct {
-	int			capacity;
+	intptr_t    capacity;
 	size_t		len;
 	uint8_t*	data;
 } scf_string_t;
 
 
 scf_string_t*	scf_string_alloc();
-
 scf_string_t*	scf_string_alloc_len(size_t len);
 
 scf_string_t*	scf_string_clone(scf_string_t* s);
@@ -32,12 +31,13 @@ int scf_string_cmp_cstr(const scf_string_t* s0, const char* str);
 int				scf_string_cmp_cstr_len(const scf_string_t* s0, const char* str, size_t len);
 
 int				scf_string_copy(scf_string_t* s0, const scf_string_t* s1);
+int             scf_string_cat (scf_string_t* s0, const scf_string_t* s1);
 
-int				scf_string_cat(scf_string_t* s0, const scf_string_t* s1);
+int             scf_string_copy_cstr(scf_string_t* s0, const char* str);
+int             scf_string_cat_cstr (scf_string_t* s0, const char* str);
 
-int				scf_string_cat_cstr(scf_string_t* s0, const char* str);
-
-int				scf_string_cat_cstr_len(scf_string_t* s0, const char* str, size_t len);
+int             scf_string_copy_cstr_len(scf_string_t* s0, const char* str, size_t len);
+int             scf_string_cat_cstr_len (scf_string_t* s0, const char* str, size_t len);
 
 int             scf_string_match_kmp(const scf_string_t* T, const scf_string_t* P, scf_vector_t* offsets);
 int             scf_string_match_kmp_cstr(const uint8_t* T, const uint8_t* P, scf_vector_t* offsets);
@@ -45,5 +45,5 @@ int scf_string_match_kmp_cstr_len(const scf_string_t* T, const uint8
 
 int             scf_string_get_offset(scf_string_t* str, const char* data, size_t len);
 
+int             scf_utf8_len(const uint8_t* str, size_t* len);
 #endif
-
-- 
2.25.1