From: Will DeWitt Jr.
Subject: Fast strlen routine?
NewsGroup: borland.public.delphi.language.basm
Date Posted: -May- at :: PST Download from Google
I've been tinkering with re-writing some of the standard C run-time
library routines and haven't really played much with MMX instructions,
or SSE for that matter. But I thought what I came up with was
interesting and maybe worth sharing--

// Returns the length of the NUL-terminated string s, scanning 8 bytes per
// step with MMX compares. Returns 0 when s is nil.
//
// NOTE(review): the numeric operands, the numbered local labels and the
// values of @@SubTable were missing from this listing; they are reconstructed
// here from the control flow (8-byte step, BSF index 0..7). Verify against
// the original post before relying on it.
// NOTE: MOVQ reads 8 bytes from a possibly unaligned address and can read
// past the terminator.
function strlenmmx(s: PAnsiChar): longword; register;
asm
        TEST    EAX, EAX
        JZ      @@Error

        PXOR    MM1, MM1                // MM1 = 8 zero bytes to compare against
        MOV     ECX, EAX                // save original pointer
@@1:
        MOVQ    MM0, [EAX]              // grab 8 chars
        PCMPEQB MM0, MM1                // per byte: FF where the char is null, 00 otherwise
        PMOVMSKB EDX, MM0               // move 8-bit mask of each char to DL
        ADD     EAX, 8                  // move pointer forward 8 chars
        TEST    EDX, EDX                // check for any null chars
        JNZ     @@2

        MOVQ    MM0, [EAX]              // unroll twice (#1)
        PCMPEQB MM0, MM1
        PMOVMSKB EDX, MM0
        ADD     EAX, 8
        TEST    EDX, EDX
        JNZ     @@2

        MOVQ    MM0, [EAX]              // (#2)
        PCMPEQB MM0, MM1
        PMOVMSKB EDX, MM0
        ADD     EAX, 8
        TEST    EDX, EDX
        JZ      @@1
@@2:
        EMMS
        BSF     EDX, EDX                // index (0..7) of the first null in the chunk
        // EAX is already 8 past the chunk start, so step back by 8 - index.
        SUB     EAX, DWORD PTR [@@SubTable+EDX*4]
        SUB     EAX, ECX                // length = address of null - start
        RET
@@SubTable:
        DD 8
        DD 7
        DD 6
        DD 5
        DD 4
        DD 3
        DD 2
        DD 1
        DD 0
@@Error:                                // s = nil: EAX is already 0
end;
// Returns the number of characters before the terminating #0 in P,
// or 0 when P is nil.
//
// Without LEGACY_PCHARLEN a plain Pascal loop is used. With it (x86 only)
// the hand-written routine below tests four bytes per iteration.
// The byte offsets, the step of 4 and the numbered labels had been lost
// from this listing and are restored here.
function _PCharLen(P: _PAnsiChr): Longint;
{$IFNDEF LEGACY_PCHARLEN}
begin
  Result := 0;
  if P <> nil then
    while P[Result] <> #0 do
      Inc(Result);
end;
{$ELSE !LEGACY_PCHARLEN}
{$IFDEF CPUX86}
asm
        TEST    EAX,EAX
        JE      @@5                     // nil -> return 0 (EAX is already 0)
        PUSH    EAX                     // remember the start address
        XOR     ECX,ECX                 // CL = 0, the byte we search for
@@0:    CMP     CL,[EAX+0]
        JE      @@4
        CMP     CL,[EAX+1]
        JE      @@3
        CMP     CL,[EAX+2]
        JE      @@2
        CMP     CL,[EAX+3]
        JE      @@1
        ADD     EAX,4
        JMP     @@0
@@1:    INC     EAX                     // fall through: advance EAX to the
@@2:    INC     EAX                     // position of the terminator
@@3:    INC     EAX
@@4:    POP     ECX                     // ECX = start address
        SUB     EAX,ECX                 // length = terminator - start
@@5:
end;
{$ENDIF CPUX86}
{$ENDIF !LEGACY_PCHARLEN}

http://www.verydemo.com/demo_c230_i66795.html

/* The C library's implementation of strlen -- more involved than one might expect. */
/*
 * Returns the number of bytes in STR before the terminating NUL.
 *
 * Bytes are examined one at a time until the pointer is aligned to an
 * unsigned long; after that a whole word is tested per iteration with
 *     (w - 0x01..01) & ~w & 0x80..80
 * which is non-zero exactly when some byte of w is zero.
 *
 * The numeric literals had been lost from this listing and are restored.
 * The magic constants are also widened for 8-byte longs; the listing only
 * set their low 32 bits, so a NUL in bytes 4..7 of a word went undetected.
 */
size_t strlen (const char *str)
{
  const char *char_ptr;
  const unsigned long int *longword_ptr;
  unsigned long int longword, himagic, lomagic;
  size_t k;

  for (char_ptr = str;
       ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
       ++char_ptr)
    if (*char_ptr == '\0')
      return char_ptr - str;

  longword_ptr = (const unsigned long int *) char_ptr;

  himagic = 0x80808080L;
  lomagic = 0x01010101L;
  if (sizeof (longword) > 4)
    {
      /* Shift in two steps so a 32-bit long never sees a shift by 32. */
      himagic = ((himagic << 16) << 16) | himagic;
      lomagic = ((lomagic << 16) << 16) | lomagic;
    }

  /* Instead of the traditional loop which tests each character,
     we test a longword at a time.  The tricky part is testing
     whether *any* byte in the longword in question is zero. */
  for (;;)
    {
      longword = *longword_ptr++;

      if (((longword - lomagic) & ~longword & himagic) != 0)
        {
          /* What matters is that a zero byte is never missed; a false
             positive would only cost a byte-wise re-check. */
          /* The word is only read, never written. */
          /* Minus one because longword_ptr was already advanced above. */
          const char *cp = (const char *) (longword_ptr - 1);

          for (k = 0; k < sizeof (longword); ++k)
            if (cp[k] == 0)
              return (size_t) (cp - str) + k;
        }
    }
}
/* Naive byte counter. The counter must start at zero: incrementing an
   uninitialised automatic variable is undefined behaviour. */
int i = 0;

while (*str++ != '\0') ++i;

return i;

http://www.strchr.com/optimized_strlen_function

http://www.strchr.com/sse2_optimised_strlen

/* Counts the bytes of str that precede its terminating NUL. */
size_t strlen(const char * str)
{
    size_t count = 0;

    /* Index forward until the terminator is reached. */
    while (str[count] != '\0')
        count++;
    return count;
}
/* Returns the distance from s to its terminating NUL byte. */
size_t strlen(const char *s) {
    const char *cursor = s;

    for (;;) {
        if (*cursor == '\0')
            return cursor - s;   /* terminator found: pointer difference is the length */
        ++cursor;
    }
}
// for x86 only
/*
 * Word-at-a-time strlen. Assumes a little-endian machine with a 4-byte
 * unsigned int, as the byte masks below are hard-coded for that layout.
 *
 * Returns the number of bytes before the terminating NUL of s.
 *
 * The numeric literals had been lost from this listing and are restored.
 * A byte-wise prologue now brings s to a 4-byte boundary first, so each
 * 4-byte load stays inside one aligned word and cannot cross into the
 * next page beyond the terminator.
 * NOTE: the word load through an unsigned pointer still relies on the
 * compiler tolerating this type pun.
 */
size_t my_strlen(const char *s) {
    size_t len = 0;

    /* Advance byte by byte until s is 4-byte aligned. */
    while (((size_t)s & 3) != 0) {
        if (*s == '\0') return len;
        ++s;
        ++len;
    }

    for(;;) {
        unsigned x = *(const unsigned*)s;
        if((x & 0xFF) == 0) return len;           /* byte 0 */
        if((x & 0xFF00) == 0) return len + 1;     /* byte 1 */
        if((x & 0xFF0000) == 0) return len + 2;   /* byte 2 */
        if((x & 0xFF000000) == 0) return len + 3; /* byte 3 */
        s += 4, len += 4;
    }
}
#ifndef WORDS_BIGENDIAN
#if 0
/*
 * Counts the trailing zero bits of a 16-bit mask by testing bit 0, 1, ... 15
 * in turn. Returns the index of the lowest set bit, or 16 if none of the
 * low 16 bits is set.
 *
 * The listing's sixteen unrolled tests had lost their bit indices; they
 * are expressed here as an equivalent bounded loop.
 */
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
{
    int i = 0;

    while (i < 16 && !(x & (1u << i)))
        i++;
    return i;
}
#elif 0
/*
 * Counts the trailing zero bits of a non-zero 16-bit mask with a binary
 * search: each step discards the low half of the remaining range when it
 * is entirely zero.
 *
 * The shift and increment amounts had been lost from this listing; they
 * are restored to match the masks that survived (0xFF -> 8, 0xF -> 4,
 * 0x3 -> 2, final bit -> 1).
 */
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
{
    // http://www.hackersdelight.org/: ntz3() shortened for 16-bit mask by Peter Kankowski
    int n = 1;
    if ((x & 0x000000FFU) == 0) {n += 8; x >>= 8;}
    if ((x & 0x0000000FU) == 0) {n += 4; x >>= 4;}
    if ((x & 0x00000003U) == 0) {n += 2; x >>= 2;}
    return n - (int)(x & 1);   // n started one too high; undo if bit 0 is set
}
#else
/*
 * Counts the trailing zero bits of a 16-bit mask with one or two lookups
 * in a 256-entry table: table[b] is the index of the lowest set bit of
 * the byte b.
 *
 * The table values had been lost from this listing and are regenerated
 * here. table[0] is 8 ("no bit set in this byte"), so an all-zero mask
 * yields 16. The high-byte index is masked so bits above 15 can never
 * index past the table.
 */
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes, by Nazo, post: 2009/07/20 03:40
{   // this is current winner for speed
    static const unsigned char table[256] =
    {
        8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
    };
    if ((unsigned char)x)
        return table[(unsigned char)x];
    return table[(x >> 8) & 0xFFu] + 8; // t[x / 256] + 8
}
#endif
#else
#if 0
/*
 * Linear bit scan for the WORDS_BIGENDIAN branch (disabled by the
 * enclosing "#if 0"): sixteen consecutive single-bit tests, each either
 * incrementing the counter or returning it.
 *
 * NOTE(review): the initial value of i and every shift operand are
 * missing from this listing, so the block does not compile as shown.
 * The alternative in the same branch is derived from a leading-zero
 * count, which suggests the tests here ran from bit 15 down to bit 0
 * with i starting at 0 -- confirm against the original before enabling.
 */
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
{
register int i = ;
if (!(x & ( << ))) i ++;
else return i;
if (!(x & ( << ))) i ++;
else return i;
if (!(x & ( << ))) i ++;
else return i;
if (!(x & ( << ))) i ++;
else return i;
if (!(x & ( << ))) i ++;
else return i;
if (!(x & ( << ))) i ++;
else return i;
if (!(x & ( << ))) i ++;
else return i;
if (!(x & ( << ))) i ++;
else return i;
if (!(x & ( << ))) i ++;
else return i;
if (!(x & ( << ))) i ++;
else return i;
if (!(x & ( << ))) i ++;
else return i;
if (!(x & ( << ))) i ++;
else return i;
if (!(x & ( << ))) i ++;
else return i;
if (!(x & ( << ))) i ++;
else return i;
if (!(x & ( << ))) i ++;
else return i;
/* Last test: no else branch, the counter is returned either way. */
if (!(x & ( << ))) i ++;
return i;
}
#else
/*
 * WORDS_BIGENDIAN variant: binary search for the highest set bit of a
 * non-zero 16-bit mask. Despite the historical "trailing zeroes" label,
 * the value returned is the number of zero bits above the highest set
 * bit (0 for 0x8000, 15 for 0x0001).
 *
 * The shift and increment amounts had been lost from this listing; they
 * are restored to match the thresholds that survived (0xFF -> 8,
 * 0xFFF -> 4, 0x3FFF -> 2, 0x7FFF -> 1).
 */
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
{
    // http://www.hackersdelight.org/: nlz1() shortened for 16-bit mask
    int n = 0;
    if (x <= 0x000000FFU) {n = n + 8; x = x << 8;}
    if (x <= 0x00000FFFU) {n = n + 4; x = x << 4;}
    if (x <= 0x00003FFFU) {n = n + 2; x = x << 2;}
    if (x <= 0x00007FFFU) {n = n + 1;}
    return n;
}
#endif
#endif
/*
 * SSE2 strlen: returns the number of bytes before the terminating NUL.
 *
 * Bytes are checked one at a time until str is 16-byte aligned; after
 * that, 16 bytes are compared against zero per iteration and the
 * position of the first NUL is taken from the lowest set bit of the
 * compare mask. Aligned 16-byte loads may read past the terminator but
 * stay inside one aligned 16-byte block.
 *
 * Define _DISABLE_ASM_BSF to force the portable count_bits_to_0()
 * instead of a compiler bit-scan.
 *
 * The numeric literals had been lost from this listing and are restored.
 * The "#ifndef _DISABLE_ASM_BSF" and the "break;" that had been fused
 * into neighbouring lines are separated again so the conditionals balance.
 */
size_t strlen(const char *str)
{
    size_t len = 0;

    // align to 16 bytes
    while ((((intptr_t)str) & (sizeof(__m128i) - 1)) != 0)
    {
        if (*str++ == 0)
            return len;
        ++len;
    }

    // search for 0
    const __m128i xmm0 = _mm_setzero_si128();
    for (;;)
    {
        __m128i xmm1 = _mm_load_si128((const __m128i *)str);
        int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(xmm1, xmm0));

        if (mask != 0)
        {
            // got 0 somewhere within 16 bytes in xmm1, or within 16 bits in mask
            // find index of first set bit
#ifndef _DISABLE_ASM_BSF // define it to disable ASM
#if (_MSC_VER >= 1300) // make sure <intrin.h> is included
            unsigned long pos;
            _BitScanForward(&pos, mask);
            len += (size_t)pos;
#elif defined(_MSC_VER) // earlier MSVC's do not have _BitScanForward, use inline asm
            __asm bsf edx, mask ; edx = bsf(mask)
            __asm add edx, len  ; edx += len
            __asm mov len, edx  ; len = edx
#elif ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))) // modern GCC has built-in __builtin_ctz
            len += (size_t)__builtin_ctz((unsigned int)mask);
#elif defined(__GNUC__) // older GCC shall use inline asm
            unsigned int pos;
            __asm__("bsf %1, %0" : "=r" (pos) : "rm" (mask));
            len += (size_t)pos;
#else // none of choices exist, use local BSF implementation
            len += count_bits_to_0(mask);
#endif
#else
            len += count_bits_to_0(mask);
#endif
            break;
        }
        str += sizeof(__m128i);
        len += sizeof(__m128i);
    }
    return len;
}

This implementation would gain a further performance boost if 'count_bits_to_0' were implemented with fewer conditional branches.

We could use _mm_loadu_si128 to load unaligned data and thus skip our own alignment loop, but the performance

would still be worse because _mm_loadu_si128 costs additional CPU cycles.

SSE2 SIMD instructions are present on all modern CPUs and thus this implementation

may bring real benefits to intensive database/text processing applications.

License: Public Domain.

http://stackoverflow.com/questions/2372315/how-to-implement-strlen-as-fast-as-possible

also do two micro-optimizations:

  • Since most strings we scan consist of ASCII chars in the range 0~127, the
    high bit is (almost) never set, so only check for it in a second test.

  • Increment an index rather than a pointer,
    which is cheaper on some architectures (notably x86) and give you the length for 'free'...

/*
 * Word-at-a-time strlen returning a 32-bit length.
 *
 * Each iteration loads 4 bytes and first applies the cheap test
 *     (u - 0x01010101) & 0x80808080
 * which fires for a NUL byte or for a byte above 127; only then is the
 * result masked with ~u to discard the non-ASCII false positives.
 *
 * NOTE: str is read 4 bytes at a time through a uint32_t pointer, so the
 * function may read up to 3 bytes past the terminator. Callers are
 * presumably expected to pass 4-byte aligned strings -- confirm.
 *
 * The numeric literals had been lost from this listing and are restored.
 * The position of the NUL inside the word is now found by inspecting the
 * bytes directly rather than by a "#if BYTE_ORDER == BIG_ENDIAN" switch.
 * That test is true whenever neither macro is defined, and on big-endian
 * layouts the most significant flag can be a borrow artefact from a 0x01
 * byte followed by a NUL.
 */
uint32_t gatopeich_strlen32(const char* str)
{
    const uint32_t *u32 = (const uint32_t*)str;
    uint32_t u, abcd, i = 0;

    while (1)
    {
        u = u32[i++];
        abcd = (u - 0x01010101) & 0x80808080;
        if (abcd &&           // If abcd is not 0, we have NUL or a non-ASCII char > 127...
            (abcd &= ~u))     // ... Discard non-ASCII chars
        {
            // A non-zero abcd here guarantees a NUL among these 4 bytes.
            const char *word = str + 4 * (i - 1);
            uint32_t k = 0;

            while (word[k] != '\0')
                k++;
            return 4 * (i - 1) + k;
        }
    }
}

http://www.opensource.apple.com/source/Libc/Libc-997.1.1/string/FreeBSD/strlen.c

strlen.c   [plain text]
/*-
* Copyright (c) 2009 Xin LI <delphij@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/ #include <sys/cdefs.h>
__FBSDID("$FreeBSD: src/lib/libc/string/strlen.c,v 1.7 2009/01/26 07:31:28 delphij Exp $"); #include <limits.h>
#include <sys/types.h>
#include <string.h> /*
* Portable strlen() for 32-bit and 64-bit systems.
*
* Rationale: it is generally much more efficient to do word length
* operations and avoid branches on modern computer systems, as
* compared to byte-length operations with a lot of branches.
*
* The expression:
*
* ((x - 0x01....01) & ~x & 0x80....80)
*
* would evaluate to a non-zero value iff any of the bytes in the
* original word is zero. However, we can further reduce ~1/3 of
* time if we consider that strlen() usually operate on 7-bit ASCII
* by employing the following expression, which allows false positive
* when high bit of 1 and use the tail case to catch these case:
*
* ((x - 0x01....01) & 0x80....80)
*
* This is more than 5.2 times as fast as the raw implementation on
* Intel T7300 under long mode for strings longer than word length.
*/ /* Magic numbers for the algorithm */
/*
 * The word size is selected from ULONG_MAX (ISO C <limits.h>) rather than
 * LONG_BIT, which is only defined when XSI extensions are visible.
 */
#if ULONG_MAX == 0xffffffffUL
static const unsigned long mask01 = 0x01010101UL;
static const unsigned long mask80 = 0x80808080UL;
#elif ULONG_MAX == 0xffffffffffffffffUL
static const unsigned long mask01 = 0x0101010101010101UL;
static const unsigned long mask80 = 0x8080808080808080UL;
#else
#error Unsupported word size
#endif

#define	LONGPTR_MASK (sizeof(long) - 1)

/*
 * Helper macro to return string length if we caught the zero
 * byte.  Expects `p' (start of the current word) and `str' (start of the
 * string) in the enclosing scope.
 */
#define testbyte(x)				\
	do {					\
		if (p[x] == '\0')		\
			return (p - str + x);	\
	} while (0)

/*
 * Returns the number of bytes in str before the terminating NUL.
 *
 * The word test used in the loop is the cheap form without `& ~x', so it
 * also fires for bytes with the high bit set; such misfires find no zero
 * in testbyte() and the scan simply moves on to the next word.
 *
 * The literals (`while (0)', the testbyte() indices and the final return
 * value) had been lost from this listing and are restored.
 */
size_t
strlen(const char *str)
{
	const char *p;
	const unsigned long *lp;

	/* Skip the first few bytes until we have an aligned p */
	for (p = str; (uintptr_t)p & LONGPTR_MASK; p++)
		if (*p == '\0')
			return (p - str);

	/* Scan the rest of the string using word sized operation */
	for (lp = (const unsigned long *)p; ; lp++)
		if ((*lp - mask01) & mask80) {
			p = (const char *)(lp);
			testbyte(0);
			testbyte(1);
			testbyte(2);
			testbyte(3);
#if ULONG_MAX > 0xffffffffUL
			testbyte(4);
			testbyte(5);
			testbyte(6);
			testbyte(7);
#endif
		}

	/* NOTREACHED */
	return (0);
}

http://www.stdlib.net/~colmmacc/strlen.c.html

   /* Copyright (C) 1991, 1993, 1997, 2000, 2003 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Torbjorn Granlund (tege@sics.se),
4 with help from Dan Sahlin (dan@sics.se);
5 commentary by Jim Blandy (jimb@ai.mit.edu).
6
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
11
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public
18 License along with the GNU C Library; if not, write to the Free
19 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20 02111-1307 USA. */ #include <string.h>
#include <stdlib.h> #undef strlen /* Return the length of the null-terminated string STR. Scan for
28 the null terminator quickly by testing four bytes at a time. */
/*
 * Return the length of the null-terminated string STR.  Scan for the
 * null terminator quickly by testing a whole longword at a time.
 *
 * The numeric literals had been lost from this listing (and stray line
 * numbers had leaked into its comments); both are repaired.  The disabled
 * "#if 0" variant based on MAGIC_BITS and its otherwise unused variable
 * are dropped; the live test is unchanged.
 */
size_t
strlen (const char *str)
{
  const char *char_ptr;
  const unsigned long int *longword_ptr;
  unsigned long int longword, himagic, lomagic;

  /* Handle the first few characters by reading one character at a time.
     Do this until CHAR_PTR is aligned on a longword boundary.  */
  for (char_ptr = str;
       ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
       ++char_ptr)
    if (*char_ptr == '\0')
      return char_ptr - str;

  /* All these elucidatory comments refer to 4-byte longwords,
     but the theory applies equally well to 8-byte longwords.  */
  longword_ptr = (const unsigned long int *) char_ptr;

  himagic = 0x80808080L;
  lomagic = 0x01010101L;
  if (sizeof (longword) > 4)
    {
      /* 64-bit version of the magic.  */
      /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
      himagic = ((himagic << 16) << 16) | himagic;
      lomagic = ((lomagic << 16) << 16) | lomagic;
    }
  if (sizeof (longword) > 8)
    abort ();

  /* Instead of the traditional loop which tests each character,
     we will test a longword at a time.  The tricky part is testing
     if *any of the four* bytes in the longword in question are zero.

     Subtracting LOMAGIC turns a zero byte into one with its top bit set,
     so masking with HIMAGIC is non-zero whenever a zero byte is present.
     Bytes that already had the top bit set can also trigger the test;
     those misfires are sorted out by the byte checks below.  */
  for (;;)
    {
      longword = *longword_ptr++;

      if (((longword - lomagic) & himagic) != 0)
        {
          /* Which of the bytes was the zero?  If none of them were, it was
             a misfire; continue the search.  */
          const char *cp = (const char *) (longword_ptr - 1);

          if (cp[0] == 0)
            return cp - str;
          if (cp[1] == 0)
            return cp - str + 1;
          if (cp[2] == 0)
            return cp - str + 2;
          if (cp[3] == 0)
            return cp - str + 3;
          if (sizeof (longword) > 4)
            {
              if (cp[4] == 0)
                return cp - str + 4;
              if (cp[5] == 0)
                return cp - str + 5;
              if (cp[6] == 0)
                return cp - str + 6;
              if (cp[7] == 0)
                return cp - str + 7;
            }
        }
    }
}
libc_hidden_builtin_def (strlen)
     /* Copyright (C) 2011-2014 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library. If not, see
17 <http://www.gnu.org/licenses/>. */ #include <string.h>
#include <stdint.h>
#include "string-endian.h"

/*
 * Return the length of the NUL-terminated string S.
 *
 * The scan works on aligned 64-bit words.  The first word is read from
 * the aligned address at or below S and combined with MASK (s_int) so
 * that the bytes preceding S cannot be mistaken for the terminator;
 * every word is then tested with the byte-wise compare-equal-immediate
 * instruction against 0, and CFZ locates the first matching byte.
 * MASK and CFZ come from "string-endian.h".
 *
 * The literals (-8, 0 and the shift by 3 that turns a bit index into a
 * byte index) had been lost from this listing and are restored.
 */
size_t
strlen (const char *s)
{
  /* Get an aligned pointer. */
  const uintptr_t s_int = (uintptr_t) s;
  const uint64_t *p = (const uint64_t *) (s_int & -8);

  /* Read and MASK the first word. */
  uint64_t v = *p | MASK (s_int);

  uint64_t bits;
  while ((bits = __insn_v1cmpeqi (v, 0)) == 0)
    v = *++p;

  return ((const char *) p) + (CFZ (bits) >> 3) - s;
}
libc_hidden_builtin_def (strlen)

http://tonybai.com/2009/04/11/glibc-strlen-source-analysis/

直接操作C标准库提供的字符串操作函数是有一定风险的,稍有不慎就会导致内存问题。

这周用业余时间写了一个小型的安全字符串操作库,但是测试之后才发现自己的实现有很大的性能缺陷。

在Solaris上初步做了一个简单的性能比对,以下是得到的性能数据(以strlen的数据为例):
当传入的字符串长度为10时,执行100w次:
strlen 执行时间是:32762毫秒
my_strlen执行时间是:491836毫秒

当传入的字符串长度为20时,执行100w次:
strlen 执行时间是:35075毫秒
my_strlen执行时间是:770397毫秒

很显然,标准库中strlen的消耗仅是my_strlen的十分之一不到,且其性能消耗随着字符串长度的增加并未有近线性的增加,

而my_strlen则是变化明显。想必大家这时也能猜到my_strlen采用了传统的实现方式,即采用逐个字节判断是否为'\0'的方式,

这也与测试出的现象相符。本着刨根问底的精神,我在网上找到了GNU提供的C标准库中strlen实现的源码,

要看看GLIBC中strlen究竟采用何种技巧才达到了那么高的性能。

说实话在性能优化这方面自己一直还处于比较初级的位置,这也将是自己将来努力的一个方向。

下载了全部GLIBC的代码包,这个包还真不小。在string子目录下找到strlen.c,这就是大多数UNIX平台、

Linux平台以及绝大多数GNU软件使用的strlen的实现源码了。

这份代码由Torbjorn Granlund(还实现了memcpy)编写,Jim Blandy和Dan Sahlin提供了帮助和注释。

包括注释在内,GLIBC的strlen的代码足足有近130行,大致浏览一下, 没有怎么看懂,可耐下心来细致阅读,

还是有些心得的。下面是strlen源码摘要版,后面我将针对这段代码写一些我的理解:

 /* Return the length of the null-terminated string STR.  Scan for
2 the null terminator quickly by testing four bytes at a time. */
/*
 * Abridged glibc strlen as discussed in the article.
 *
 * The listing had lost its numeric literals, had '-' replaced by a
 * typographic dash and '\0' by an empty character constant; all are
 * repaired.  The two-line widening of the magic constants is kept even in
 * this abridged form: without it the 8-byte path below would miss a NUL
 * in bytes 4..7 of a word.
 */
size_t strlen (const char *str)
{
  const char *char_ptr;
  const unsigned long int *longword_ptr;
  unsigned long int longword, himagic, lomagic;

  /* Handle the first few characters by reading one character at a time.
     Do this until CHAR_PTR is aligned on a longword boundary. */
  for (char_ptr = str; ((unsigned long int) char_ptr
                        & (sizeof (longword) - 1)) != 0;
       ++char_ptr)
    if (*char_ptr == '\0')
      return char_ptr - str;

  /* All these elucidatory comments refer to 4-byte longwords,
     but the theory applies equally well to 8-byte longwords. */
  longword_ptr = (const unsigned long int *) char_ptr;

  himagic = 0x80808080L;
  lomagic = 0x01010101L;
  if (sizeof (longword) > 4)
    {
      /* Shift in two steps so a 32-bit long never sees a shift by 32. */
      himagic = ((himagic << 16) << 16) | himagic;
      lomagic = ((lomagic << 16) << 16) | lomagic;
    }
  if (sizeof (longword) > 8)
    abort ();

  /* Instead of the traditional loop which tests each character,
     we will test a longword at a time. The tricky part is testing
     if *any of the four* bytes in the longword in question are zero. */
  for (;;)
    {
      longword = *longword_ptr++;

      if (((longword - lomagic) & himagic) != 0)
        {
          /* Which of the bytes was the zero? If none of them were, it was
             a misfire; continue the search. */
          const char *cp = (const char *) (longword_ptr - 1);

          if (cp[0] == 0)
            return cp - str;
          if (cp[1] == 0)
            return cp - str + 1;
          if (cp[2] == 0)
            return cp - str + 2;
          if (cp[3] == 0)
            return cp - str + 3;
          if (sizeof (longword) > 4)
            {
              if (cp[4] == 0)
                return cp - str + 4;
              if (cp[5] == 0)
                return cp - str + 5;
              if (cp[6] == 0)
                return cp - str + 6;
              if (cp[7] == 0)
                return cp - str + 7;
            }
        }
    }
}

从这段代码开头作者的注释我们大致可以了解到该strlen实现的原理:

就是通过每次测试四个字节来代替传统实现中每次测试一个字节的方法。

知道这个原理了,那么还需要解决两个难题:
1) C标准库要求有很好的移植性,在绝大部分系统体系结构下都应该能正确运行。

那么每次拿出4个字节比较(unsigned long int),就需要考虑内存对齐问题,

传入的字符串的首字符地址可不一定在4对齐的地址上;

2) 如何对四个字节进行测试,找出其中某个字节为全0,这是个技巧问题。

12~21行的代码解决的就是第一个问题:

          for (char_ptr = str; ((unsigned long int) char_ptr
& (sizeof (longword) – )) != ;
++char_ptr)
if (*char_ptr == '')
return char_ptr – str; /* All these elucidatory comments refer to 4-byte longwords,
19 but the theory applies equally well to 8-byte longwords. */ longword_ptr = (unsigned long int *) char_ptr;

作者通过一个for-loop找到传入字符串中第一个地址对齐到4的字符的地址,由于该地址已经对齐到4,

所以最后一行那个强制转型是安全的。虽然可以通过圆整算式直接得到该对齐地址,但是考虑到这个区间可能存在的'\0',

一个字符一个字符比对也是不可避免的。在很多严格对齐的架构上(比如SUN的SPARC平台),

编译器一般会将字符串地址在编译器就放到对齐的地址上,这样一来,实际执行strlen时for-loop很少能执行一步。

第二个问题作者则是通过一个"带前提"的技巧来解决的。作者设定了两个掩码变量:

          himagic = 0x80808080L;
lomagic = 0x01010101L;

并通过一个conditional expression完成了对四字节中全0字节的检测:

((longword – lomagic) & himagic) != 0

我们将himagic和lomagic按bit展开:
himagic   1000 0000 1000 0000 1000 0000 1000 0000
lomagic   0000 0001 0000 0001 0000 0001 0000 0001

对于这样的代码,似乎没有什么理论可以遵循,需要在实践中去理解。

起初我构造了一个不含全0字节的longword,比如:
longword  1000 0001 1000 0001 1000 0001 1000 0001,

然后按照那个条件表达式计算后,居然也满足!=0的条件,是不是作者的逻辑有问题呢?

后来转念一想,这种逻辑是有“前提条件”的。回顾一下strlen是做什么的,其输入参数是任意的么?

当然不是。输入的字符串中每个字符的值都在[0, 127]的ascii码范围内,

也就是说每个字节最高位的bit都是0,这样longword就应该是如下这个样子了:

longword  0xxx xxxx 0xxx xxxx 0xxx xxxx 0xxx xxxx

基于这样的前提我们考虑两种情况:

当longword中没有全0字节时,比如:

longword 0000 0001 0000 0001 0000 0001 0000 0001

这样在做完计算后,值为0,不满足条件。

当longword中有全零字节时,比如:

longword 0000 0000 0000 0001 0000 0001 0000 0001

这样在做完计算后,最高字节最高bit的值肯定为1,满足!=0条件,全0字节被检测出来。

也就是说一旦有全0字节,在减去lomagic时势必会产生借位,全0的那个字节在减去lomagic后最高位bit肯定由0变1,

这样与himagic一与,肯定不为0,就是这么检测出来的。

这一方法在64位平台依然适用,上面的代码摘要中省略了对64bit平台的特殊处理,为的是使代码逻辑更清晰,更易读。

 function CStrLength( CString : PAnsiChar ) : NativeUInt;
// Returns the number of characters before the terminating #0 of CString
// (a Pascal port of the glibc word-at-a-time strlen).
//
// Characters are tested one at a time until the pointer is aligned to
// SizeOf( NativeUInt ); after that a whole native word is tested per
// iteration with
//   ( Native - LoMagic ) and HiMagic and not Native
// which is non-zero only when some byte of the word is zero.
//
// Changes against the listing:
//  * the numeric literals and #0 constants, which had been lost, are restored;
//  * the alignment loop now tests the current character BEFORE advancing
//    (it used to advance first and so never examined the first character);
//  * the two alternative "if ... then : ..." lines, which were not valid
//    Pascal, are replaced by the exact (zero-byte-only) test;
//  * the unused MagicBits variable is removed.
//
// NOTE(review): Native - LoMagic wraps around by design; this presumably
// requires overflow checking to be off for this routine - confirm.
var
  AnsiCharPtr : PAnsiChar;
  NativePtr   : PNativeUInt;
  Native      : NativeUInt;
  HiMagic     : NativeUInt;
  LoMagic     : NativeUInt;
  I           : Integer;
begin
  // Handle the first few characters by reading one character at a time.
  // Do this until the pointer is aligned on a native word boundary.
  AnsiCharPtr := CString;
  while ( NativeUInt( AnsiCharPtr ) and ( SizeOf( NativeUInt ) - 1 ) ) <> 0 do
  begin
    if AnsiCharPtr^ = #0 then
    begin
      Result := NativeUInt( AnsiCharPtr ) - NativeUInt( CString );
      Exit;
    end;
    Inc( AnsiCharPtr );
  end;

  NativePtr := PNativeUInt( AnsiCharPtr );

  HiMagic := $80808080;
  LoMagic := $01010101;

  // 64-bit version of the magic.
  // Do the shift in two steps so that a 32-bit word never sees a shift by 32.
  if SizeOf( Native ) > 4 then
  begin
    HiMagic := ( ( HiMagic shl 16 ) shl 16 ) or HiMagic;
    LoMagic := ( ( LoMagic shl 16 ) shl 16 ) or LoMagic;
  end;

  // Instead of the traditional loop which tests each character,
  // test a native word at a time.
  while True do
  begin
    Native := NativePtr^;
    if ( ( Native - LoMagic ) and HiMagic and not Native ) <> 0 then
    begin
      // Which of the bytes was the zero?
      AnsiCharPtr := PAnsiChar( NativePtr );
      for I := 0 to SizeOf( Native ) - 1 do
        if AnsiCharPtr[ I ] = #0 then
        begin
          Result := NativeUInt( AnsiCharPtr ) - NativeUInt( CString ) + NativeUInt( I );
          Exit;
        end;
    end;
    Inc( NativePtr );
  end;
end;

glibc strlen delphi pascal的更多相关文章

  1. Delphi / Pascal 语法知识干货

    ********************************************* Pascal.Delph干货 *************************************** ...

  2. NotePad++ delphi/Pascal函数过程列表插件

    从cnpack上爬下来的 函数过程列表 点击下载

  3. Delphi的分配及释放---New/Dispose, GetMem/FreeMem及其它函数的区别与相同

    转载自:http://www.cnblogs.com/qiusl/p/4028437.html?utm_source=tuicool 我估摸着内存分配+释放是个基础的函数,有些人可能没注意此类函数或细 ...

  4. delphi.memory.分配及释放---New/Dispose, GetMem/FreeMem及其它函数的区别与相同

    我估摸着内存分配+释放是个基础函数,有些人可能没注意此类函数或细究,但我觉得还是弄明白的好. 介绍下面内存函数前,先说一下MM的一些过程,如不关心可忽略: TMemoryManager = recor ...

  5. delphi.位操作

    位操作网上有很多介绍,请上网google/baidu,比如: 位操作技巧实例大全: http://blog.csdn.net/g_spider/article/details/5750665 位操作基 ...

  6. [Delphi]Delphi开发的一些技巧

    一.提高查询效率先进行准备查询操作: CustomerQuery.Close; if not (CustomerQuery.Prepared) then -->查询是否已准备好 Customer ...

  7. delphi的tserversocket控件如何接收16进制数

    http://bbs.csdn.net/topics/390473005 对方客户端发送数据如:68 00 00··········:接收完成后,数据长度没错(13),但是显示接收结果时,只显示一个字 ...

  8. Delphi MDI程序 父窗体如何调用当前活动子窗体的函数/过程

    一个MDI文本文件编辑器打开了N个子窗体子窗体的.pas文件有一些public的过程和函数我想在父窗体调用当前活动的子窗体函数我用Self.ActiveChildForm无法调用直接frmEdit.x ...

  9. Pascal编译器大全(非常难得)

    http://www.pascaland.org/pascall.htm Some titles (french) : Compilateurs Pascal avec sources = compi ...

随机推荐

  1. IClassSchemaEdit修改要素类信息

    private void ChangeFeatureClassAliasName(IFeatureClass pFeatureClass, string aliasName) { ISchemaLoc ...

  2. REST构架风格介绍:状态表述转移

    REST(Representational State Transfer)是HTTP协议的作者Roy Fielding博士在其博士论文中提出的一种互联网应用构架风格.与以远程对象为核心的ORB和以服务 ...

  3. 分享一些Comet开发经验

    前言 本comet技术主要用于数据库持久层的 穿越防火墙 远程访问.只要有一台中继网站,任意地点的数据库都能被访问. Comet概念介绍 WebIM.网页的客服.meebo等大家听说过了.最近还有个兄 ...

  4. Web前端开发工程师编程能力飞升之路

    [背景] 如果你是刚进入web前端研发领域,想试试这潭水有多深,看这篇文章吧:如果你是做了两三年web产品前端研发,迷茫找不着提高之路,看这篇文章吧:如果你是四五年的前端开发高手,没有难题能难得住你的 ...

  5. [Asp.net MVC]Asp.net MVC5系列——Razor语法

    Razor视图引擎是Asp.net MVC3中新扩展的内容,并且也是它的默认视图引擎.还有另外一种Web Forms视图引擎.通过前面的文章可知在Asp.net mvc5中创建视图,默认使用的是Raz ...

  6. Linux 的 screen用法

    screen可以将任务挂起,即将任务放在后台,一般5个任务左右. 1.新建screen会话:直接输入screen命令或者screen -S [会话名称] 2.退出会话:按下组合键Ctrl+a并松开,此 ...

  7. linux nginx安装(转载)

    1.linux 下面安装 1.下载 pcre-8.10.tar.gz  nginx-1.1.1.tar.gz 2.安装 pcre 让nginx支持rewrite pcre-8.10.tar.gz  上 ...

  8. 硬盘类型和Linux的分区

    目前硬盘主要分为以下几种:IDE,SCSI,SATA,SAS.其中SAS(Serial Attached SCSI)即串行连接SCSI,属于SCSI的新一代技术. 以下是Linux常见设备的名称: 装 ...

  9. s3c2440串口裸板驱动(使用fifo)

    使用fifo的好处有: 1:串口的数据发送的数据量较大时,使用fifo可以大大降低MCU的开销.(有点类似串入并出的cput处理模型,本质上还是串行收发) 2:在某些特殊场合,例如制定较复杂的协议时, ...

  10. 编译python3

    安装环境 yum install gcc yum install zlib-devel yum install make 下载python版本 wget http://www.python.org/f ...