c语言实现解析转义字符串

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
static const char *parse_string(const char *str)
{
const char *ptr=str+1;char *ptr2;char *out;int len=0;unsigned uc,uc2;
if (*str!='\"') {return 0;} /* not a string! */

while (*ptr!='\"' && *ptr && ++len) if (*ptr++ == '\\') ptr++; /* Skip escaped quotes. */

out=(char*)malloc(len+1); /* This is how long we need for the string, roughly. */
if (!out) return 0;

ptr=str+1;ptr2=out;
while (*ptr!='\"' && *ptr)
{
if (*ptr!='\\') *ptr2++=*ptr++;
else
{
ptr++;
switch (*ptr)
{
case 'b': *ptr2++='\b'; break;
case 'f': *ptr2++='\f'; break;
case 'n': *ptr2++='\n'; break;
case 'r': *ptr2++='\r'; break;
case 't': *ptr2++='\t'; break;
case 'u': /* transcode utf16 to utf8. */
sscanf(ptr+1,"%4x",&uc);ptr+=4; /* get the unicode char. */

if ((uc>=0xDC00 && uc<=0xDFFF) || uc==0) break; /* check for invalid. */

if (uc>=0xD800 && uc<=0xDBFF) /* UTF16 surrogate pairs. */
{
if (ptr[1]!='\\' || ptr[2]!='u') break; /* missing second-half of surrogate. */
sscanf(ptr+3,"%4x",&uc2);ptr+=6;
if (uc2<0xDC00 || uc2>0xDFFF) break; /* invalid second-half of surrogate. */
uc=0x10000 + (((uc&0x3FF)<<10) | (uc2&0x3FF));
}

len=4;if (uc<0x80) len=1;else if (uc<0x800) len=2;else if (uc<0x10000) len=3; ptr2+=len;

switch (len) {
case 4: *--ptr2 =((uc | 0x80) & 0xBF); uc >>= 6;
case 3: *--ptr2 =((uc | 0x80) & 0xBF); uc >>= 6;
case 2: *--ptr2 =((uc | 0x80) & 0xBF); uc >>= 6;
case 1: *--ptr2 =(uc | firstByteMark[len]);
}
ptr2+=len;
break;
default: *ptr2++=*ptr; break;
}
ptr++;
}
}
*ptr2=0;
if (*ptr=='\"') ptr++;
return out;
}
int main(void) {
printf("%s\r\n","hh\r\nh");
return 0;
}