-
-
Notifications
You must be signed in to change notification settings - Fork 421
Make encode reusable in Phobos #2404
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -425,61 +425,98 @@ dchar decode(in dchar[] s, ref size_t idx) | |
return c; // dummy return | ||
} | ||
|
||
|
||
/* =================== Encode ======================= */ | ||
|
||
/******************************* | ||
* Encodes character c and appends it to array s[]. | ||
/** | ||
* Encodes `c` into the static array `buf`. | ||
* | ||
* Params: | ||
* buf = destination of encoded character | ||
* c = character to encode | ||
* | ||
* Returns: | ||
* The length of the encoded character (a number between `1` and `4` for | ||
* `char[4]` buffers and a number between `1` and `2` for `wchar[2]` buffers) | ||
* or `0` in case of failure. | ||
*/ | ||
@safe pure nothrow | ||
void encode(ref char[] s, dchar c) | ||
in | ||
@nogc nothrow pure @safe | ||
size_t encode(out char[4] buf, dchar c) | ||
in | ||
{ | ||
assert(isValidDchar(c)); | ||
} | ||
do | ||
{ | ||
if (c <= 0x7F) | ||
{ | ||
assert(isValidDchar(c)); | ||
buf[0] = cast(char) c; | ||
return 1; | ||
} | ||
do | ||
else if (c <= 0x7FF) | ||
{ | ||
char[] r = s; | ||
|
||
if (c <= 0x7F) | ||
{ | ||
r ~= cast(char) c; | ||
} | ||
else | ||
{ | ||
char[4] buf; | ||
uint L; | ||
buf[0] = cast(char)(0xC0 | (c >> 6)); | ||
buf[1] = cast(char)(0x80 | (c & 0x3F)); | ||
return 2; | ||
} | ||
else if (c <= 0xFFFF) | ||
{ | ||
buf[0] = cast(char)(0xE0 | (c >> 12)); | ||
buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); | ||
buf[2] = cast(char)(0x80 | (c & 0x3F)); | ||
return 3; | ||
} | ||
else if (c <= 0x10FFFF) | ||
{ | ||
buf[0] = cast(char)(0xF0 | (c >> 18)); | ||
buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); | ||
buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); | ||
buf[3] = cast(char)(0x80 | (c & 0x3F)); | ||
return 4; | ||
} | ||
return 0; | ||
} | ||
|
||
if (c <= 0x7FF) | ||
{ | ||
buf[0] = cast(char)(0xC0 | (c >> 6)); | ||
buf[1] = cast(char)(0x80 | (c & 0x3F)); | ||
L = 2; | ||
} | ||
else if (c <= 0xFFFF) | ||
{ | ||
buf[0] = cast(char)(0xE0 | (c >> 12)); | ||
buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); | ||
buf[2] = cast(char)(0x80 | (c & 0x3F)); | ||
L = 3; | ||
} | ||
else if (c <= 0x10FFFF) | ||
{ | ||
buf[0] = cast(char)(0xF0 | (c >> 18)); | ||
buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); | ||
buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); | ||
buf[3] = cast(char)(0x80 | (c & 0x3F)); | ||
L = 4; | ||
} | ||
else | ||
{ | ||
assert(0); | ||
} | ||
r ~= buf[0 .. L]; | ||
} | ||
s = r; | ||
/// ditto | ||
@nogc nothrow pure @safe | ||
size_t encode(out wchar[2] buf, dchar c) | ||
in | ||
{ | ||
assert(isValidDchar(c)); | ||
} | ||
do | ||
{ | ||
if (c <= 0xFFFF) | ||
{ | ||
buf[0] = cast(wchar) c; | ||
return 1; | ||
} | ||
else if (c <= 0x10FFFF) | ||
{ | ||
buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); | ||
buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00); | ||
return 2; | ||
} | ||
return 0; | ||
} | ||
|
||
/** | ||
* Encodes character c and appends it to array s[]. | ||
*/ | ||
nothrow pure @safe | ||
void encode(ref char[] s, dchar c) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not a fan of the following functions because:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Exactly: #2404 (comment). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, is this the right order of arguments for UFCS? Would it be better to pass There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The sink normally is the first argument, as in:
|
||
in | ||
{ | ||
assert(isValidDchar(c)); | ||
} | ||
do | ||
{ | ||
char[4] buf; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
size_t L = encode(buf, c); | ||
assert(L); // If L is 0, then encode has failed | ||
s ~= buf[0 .. L]; | ||
} | ||
|
||
/// | ||
unittest | ||
{ | ||
debug(utf) printf("utf.encode.unittest\n"); | ||
|
@@ -499,43 +536,32 @@ unittest | |
assert(s == "abcda\xC2\xA9\xE2\x89\xA0"); | ||
} | ||
|
||
/** ditto */ | ||
@safe pure nothrow | ||
/// ditto | ||
nothrow pure @safe | ||
void encode(ref wchar[] s, dchar c) | ||
in | ||
{ | ||
assert(isValidDchar(c)); | ||
} | ||
do | ||
{ | ||
wchar[] r = s; | ||
|
||
if (c <= 0xFFFF) | ||
{ | ||
r ~= cast(wchar) c; | ||
} | ||
else | ||
{ | ||
wchar[2] buf; | ||
|
||
buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); | ||
buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00); | ||
r ~= buf; | ||
} | ||
s = r; | ||
} | ||
in | ||
{ | ||
assert(isValidDchar(c)); | ||
} | ||
do | ||
{ | ||
wchar[2] buf; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
size_t L = encode(buf, c); | ||
assert(L); | ||
s ~= buf[0 .. L]; | ||
} | ||
|
||
/** ditto */ | ||
@safe pure nothrow | ||
/// ditto | ||
nothrow pure @safe | ||
void encode(ref dchar[] s, dchar c) | ||
in | ||
{ | ||
assert(isValidDchar(c)); | ||
} | ||
do | ||
{ | ||
s ~= c; | ||
} | ||
in | ||
{ | ||
assert(isValidDchar(c)); | ||
} | ||
do | ||
{ | ||
s ~= c; | ||
} | ||
|
||
/** | ||
Returns the code length of $(D c) in the encoding using $(D C) as a | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great idea. Let's define this function for all inputs (no contracts, no
assert(isValidDChar)
) and have it return 0 upon bad dchar. Then higher-level functions can throw exceptions, use replacement char etc.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Additionally, instead of returning a length, return a slice to the buffer, or
null
on baddchar
, because the next thing people will need to do is slice that buffer.Returning
null
and not an empty slice is important, because otherwiseif (auto s = encode(...))
will always betrue
.Lastly, since this piece of code is highly performance sensitive, did you inspect the assembly to make sure
out
does not get default-initialized on function entry ?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Isn't it better to pass
buf
as a regularchar[]
to push the allocation choice to the caller? If the caller wants to pass a static array, the array can be sliced to be able to pass it aschar[]
.