Skip to content

Commit

Permalink
Syntax improvements.
Browse files Browse the repository at this point in the history
  • Loading branch information
RyanLamansky committed Jul 27, 2024
1 parent e5fb0e4 commit 8d98564
Show file tree
Hide file tree
Showing 16 changed files with 92 additions and 136 deletions.
48 changes: 24 additions & 24 deletions HtmlUtilities.Tests/CodePointTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,15 @@ public static void CodePointsHaveCorrectInfraCategories(int codePoint, CodePoint
Assert.Equal(categories, new CodePoint(codePoint).InfraCategories);
}

public static readonly object?[][] Utf8ValidTestCases = new object?[][] {
new object?[] { null, Array.Empty<CodePoint>() },
new object[] { "$", new CodePoint[] { 0x0024 } },
new object[] { "£", new CodePoint[] { 0x00A3 } },
new object[] { "ह", new CodePoint[] { 0x0939 } },
new object[] { "€", new CodePoint[] { 0x20AC } },
new object[] { "한", new CodePoint[] { 0xd55c } },
new object[] { "𐍈", new CodePoint[] { 0x10348 } },
};
public static readonly object?[][] Utf8ValidTestCases = [
[null, Array.Empty<CodePoint>()],
["$", new CodePoint[] { 0x0024 }],
["£", new CodePoint[] { 0x00A3 }],
["ह", new CodePoint[] { 0x0939 }],
["€", new CodePoint[] { 0x20AC }],
["한", new CodePoint[] { 0xd55c }],
["𐍈", new CodePoint[] { 0x10348 }],
];

[Theory]
[MemberData(nameof(Utf8ValidTestCases))]
Expand All @@ -58,20 +58,20 @@ public static void EncodeUtf8FromEnumerableCodePoints(string? expected, CodePoin
Assert.Equal(expected is null ? "" : expected, UTF8.GetString(CodePoint.EncodeUtf8(value).ToArray()));
}

public static readonly object?[][] Utf16TestCases = new object?[][] {
new object?[] { null, Array.Empty<CodePoint>() },
new object[] { "$", new CodePoint[] { 0x0024 } },
new object[] { "€", new CodePoint[] { 0x20AC } },
new object[] { "𐐷", new CodePoint[] { 0x10437 } },
new object[] { "𤭢", new CodePoint[] { 0x24B62 } },
};
public static readonly object?[][] Utf16TestCases = [
[null, Array.Empty<CodePoint>()],
["$", new CodePoint[] { 0x0024 }],
["€", new CodePoint[] { 0x20AC }],
["𐐷", new CodePoint[] { 0x10437 }],
["𤭢", new CodePoint[] { 0x24B62 }],
];

public static readonly object?[][] Utf16TestCasesWithInvalidCodePoints = new object?[][] {
new object[] { "", new CodePoint[] { 0x110000 } },
new object[] { "$", new CodePoint[] { 0x0024, 0x110000 } },
new object[] { "$$", new CodePoint[] { 0x0024, 0x110000, 0x0024 } },
new object[] { "$", new CodePoint[] { 0x110000, 0x0024 } },
};
public static readonly object?[][] Utf16TestCasesWithInvalidCodePoints = [
["", new CodePoint[] { 0x110000 }],
["$", new CodePoint[] { 0x0024, 0x110000 }],
["$$", new CodePoint[] { 0x0024, 0x110000, 0x0024 }],
["$", new CodePoint[] { 0x110000, 0x0024 }],
];

[Theory]
[MemberData(nameof(Utf16TestCases))]
Expand Down Expand Up @@ -243,8 +243,8 @@ public static void TrySpanFormattable(string expected, int codePoint)
[Fact]
public static void TrySpanFormattableTooShortIsFalse()
{
Assert.False(new CodePoint('T').TryFormat(Span<char>.Empty, out var charsWritten));
Assert.False(new CodePoint(0x24B62).TryFormat(Span<char>.Empty, out charsWritten));
Assert.False(new CodePoint('T').TryFormat([], out var charsWritten));
Assert.False(new CodePoint(0x24B62).TryFormat([], out charsWritten));

Span<char> destination = stackalloc char[1];
Assert.False(new CodePoint(0x24B62).TryFormat(destination, out charsWritten));
Expand Down
2 changes: 1 addition & 1 deletion HtmlUtilities.Tests/HtmlWriterAsyncTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ await HtmlWriter.WriteDocumentAsync(buffer, null, (writer, cancellationToken) =>
{
return writer.WriteElementAsync(new ValidatedElement("body"), children: (writer, cancellationToken) =>
{
return writer.WriteElementAsync(new ValidatedElement("div", new ValidatedAttribute[] { ("id", "react-app") }), attributes => attributes.Write("class", "root"), cancellationToken: cancellationToken);
return writer.WriteElementAsync(new ValidatedElement("div", [("id", "react-app")]), attributes => attributes.Write("class", "root"), cancellationToken: cancellationToken);
}, cancellationToken: cancellationToken);
}).ConfigureAwait(false);

Expand Down
2 changes: 1 addition & 1 deletion HtmlUtilities.Tests/HtmlWriterTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ public static void WriteDocumentWithMixedAttributes()
{
writer.WriteElement(new ValidatedElement("body"), children: writer =>
{
writer.WriteElement(new ValidatedElement("div", new ValidatedAttribute[] { new("id", "react-app")}), attributes => attributes.Write("class", "root"));
writer.WriteElement(new ValidatedElement("div", [new("id", "react-app")]), attributes => attributes.Write("class", "root"));
});
});

Expand Down
2 changes: 1 addition & 1 deletion HtmlUtilities.Tests/Validated/ValidatedElementTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,6 @@ public static void ValidatedElementIsCorrect()
[Fact]
public static void ValidatedElementWithAttributesIsCorrect()
{
Assert.Equal("<html lang=en-us>", new ValidatedElement("html", new ValidatedAttribute[] { ("lang", "en-us") }).ToString());
Assert.Equal("<html lang=en-us>", new ValidatedElement("html", [("lang", "en-us")]).ToString());
}
}
36 changes: 19 additions & 17 deletions HtmlUtilities/ArrayBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,15 @@ namespace HtmlUtilities;
/// </summary>
/// <typeparam name="T">The type of array desired.</typeparam>
/// <remarks>This is not part of the public API because it's easy to misuse and I don't want to help everyone fix their bugs.</remarks>
internal ref struct ArrayBuilder<T>
internal ref struct ArrayBuilder<T>(int initialCapacity)
{
private T[] buffer;
private int written;

public ArrayBuilder(int initialCapacity)
{
// Accessing ArrayPool<T>.Shared is inlined by the JIT, which then sees a sealed implementation, allowing removal of the vtable lookups.
// The object returned by ArrayPool<T>.Create cannot get this benefit.
// Reference: https://github.com/dotnet/runtime/blob/main/src/libraries/System.Private.CoreLib/src/System/Buffers/ArrayPool.cs
// Using the shared pool carries risk of malfunction if another user returns a rented array but continues to modify it.
// Microsoft uses ArrayPool<T>.Shared extensively within the .NET runtime, so if they're okay with this risk, I suppose I am, too.
buffer = ArrayPool<T>.Shared.Rent(initialCapacity);
written = 0;
}
// Accessing ArrayPool<T>.Shared is inlined by the JIT, which then sees a sealed implementation, allowing removal of the v-table lookups.
// The object returned by ArrayPool<T>.Create cannot get this benefit.
// Reference: https://github.com/dotnet/runtime/blob/main/src/libraries/System.Private.CoreLib/src/System/Buffers/ArrayPool.cs
// Using the shared pool carries risk of malfunction if another user returns a rented array but continues to modify it.
// Microsoft uses ArrayPool<T>.Shared extensively within the .NET runtime, so if they're okay with this risk, I suppose I am, too.
private T[] buffer = ArrayPool<T>.Shared.Rent(initialCapacity);
private int written = 0;

private void Grow(int amount)
{
Expand Down Expand Up @@ -59,16 +53,24 @@ public void Write(ReadOnlySpan<T> values)
written += length;
}

internal T[] Buffer => buffer;
internal readonly T[] Buffer => buffer;

internal readonly ReadOnlyMemory<T> WrittenMemory => buffer.AsMemory(0, written);

public readonly ReadOnlySpan<T> WrittenSpan => buffer.AsSpan(0, written);

public readonly T[] ToArray() => WrittenSpan.ToArray();

public readonly void Release()
public readonly void Release() => ArrayPool<T>.Shared.Return(buffer);
}

internal static class ArrayBuilderExtensions
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void Write(ref this ArrayBuilder<byte> arrayBuilder, char c)
{
ArrayPool<T>.Shared.Return(buffer);
System.Diagnostics.Debug.Assert(c <= byte.MaxValue);

arrayBuilder.Write((byte)c);
}
}
4 changes: 2 additions & 2 deletions HtmlUtilities/AttributeWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ public void Write(ReadOnlySpan<char> name, ulong value)
public void Write(ValidatedAttributeName name, ulong? value)
{
if (value is null)
Write(name, ReadOnlySpan<char>.Empty);
Write(name, []);
else
Write(name, value.GetValueOrDefault());
}
Expand All @@ -236,7 +236,7 @@ public void Write(ValidatedAttributeName name, ulong? value)
public void Write(ReadOnlySpan<char> name, ulong? value)
{
if (value is null)
Write(name, ReadOnlySpan<char>.Empty);
Write(name, []);
else
Write(name, value.GetValueOrDefault());
}
Expand Down
33 changes: 12 additions & 21 deletions HtmlUtilities/CodePoint.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@ namespace HtmlUtilities;
/// Represents a single Unicode code point as described by https://infra.spec.whatwg.org/#code-points.
/// Also provided are several mechanisms to convert to and from <see cref="CodePoint"/> values.
/// </summary>
public readonly struct CodePoint : IEquatable<CodePoint>, IComparable, IComparable<CodePoint>, ISpanFormattable, IFormattable
/// <param name="value">The raw Unicode code point value.</param>
public readonly struct CodePoint(uint value) : IEquatable<CodePoint>, IComparable, IComparable<CodePoint>, ISpanFormattable, IFormattable
{
/// <summary>
/// Gets the raw Unicode code point value.
/// Valid code points are in the range of 0 through 0x10FFFF (1114111 in decimal), but <see cref="CodePoint"/> accepts the full range of <see cref="uint"/>.
/// </summary>
public uint Value { get; }
public uint Value { get; } = value;

/// <summary>
/// Creates a new <see cref="CodePoint"/> with the provided raw Unicode value.
Expand All @@ -24,21 +25,11 @@ public CodePoint(int value) : this((uint)value)
{
}

/// <summary>
/// Creates a new <see cref="CodePoint"/> with the provided raw Unicode value.
/// </summary>
/// <param name="value">The raw Unicode code point value.</param>
public CodePoint(uint value)
{
this.Value = value;
}

// This pre-calculated lookup table provides O(1) lookup time for ASCII characters.
// https://github.com/dotnet/runtime/issues/60948 (via https://github.com/dotnet/roslyn/pull/61414) can potentially make this faster.
// It would also save 512 bytes + overhead of this statically allocated array.
// The current approach was the fastest known option at the time it was written.
private static readonly CodePointInfraCategory[] AsciiInfraCategories = new[]
{
// https://github.com/dotnet/runtime/issues/60948 (via https://github.com/dotnet/roslyn/pull/61414) accelerates it.
// It works by creating a ReadOnlySpan into a compile-time generated constant.
private static ReadOnlySpan<CodePointInfraCategory> AsciiInfraCategories =>
[
ScalarValue | Ascii | C0Control | C0ControlOrSpace | Control,
ScalarValue | Ascii | C0Control | C0ControlOrSpace | Control,
ScalarValue | Ascii | C0Control | C0ControlOrSpace | Control,
Expand Down Expand Up @@ -167,7 +158,7 @@ public CodePoint(uint value)
ScalarValue | Ascii,
ScalarValue | Ascii,
ScalarValue | Ascii | Control,
};
];

/// <summary>
/// Gets the categories of a <see cref="CodePoint"/> as defined by <a href="https://infra.spec.whatwg.org/#code-points">the "infra" standard</a>.
Expand All @@ -180,7 +171,7 @@ public CodePointInfraCategory InfraCategories
{
// Considering that this is an HTML-oriented project, ASCII will be very common so we have a fast path for that.
if (Value < AsciiInfraCategories.Length)
return AsciiInfraCategories[Value];
return AsciiInfraCategories[(int)Value];

return NonAsciiInfraCategory(Value);
}
Expand Down Expand Up @@ -479,13 +470,13 @@ public bool TryFormat(Span<char> destination, out int charsWritten, ReadOnlySpan
public static implicit operator CodePoint(char value) => new(value);

/// <summary>
/// Losslessly converts the <see cref="CodePoint"/> value into an <see cref="int"/>.
/// Converts the <see cref="CodePoint"/> value into an <see cref="int"/>.
/// </summary>
/// <param name="value">The value to convert.</param>
public static implicit operator int(CodePoint value) => (int)value.Value;

/// <summary>
/// Losslessly converts the <see cref="CodePoint"/> value into an <see cref="uint"/>.
/// Converts the <see cref="CodePoint"/> value into a <see cref="uint"/>.
/// </summary>
/// <param name="value">The value to convert.</param>
public static implicit operator uint(CodePoint value) => value.Value;
Expand All @@ -494,7 +485,7 @@ public bool TryFormat(Span<char> destination, out int charsWritten, ReadOnlySpan
/// Converts the <see cref="CodePoint"/> value into a <see cref="char"/>.
/// </summary>
/// <param name="value">The value to convert.</param>
/// <exception cref="OverflowException">The <see cref="Value"/> of the provided <see cref="CodePoint"/> can't be losslessly converted to <see cref="char"/>.</exception>
/// <exception cref="OverflowException">The <see cref="Value"/> of the provided <see cref="CodePoint"/> can't be converted to <see cref="char"/>.</exception>
public static explicit operator char(CodePoint value) => checked((char)value.Value);

/// <summary>
Expand Down
43 changes: 14 additions & 29 deletions HtmlUtilities/HtmlWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,8 @@ namespace HtmlUtilities;
public readonly struct HtmlWriter
{
private static readonly ValidatedElement html = new(
new byte[]
{
(byte)'<', (byte)'!', (byte)'D', (byte)'O', (byte)'C', (byte)'T', (byte)'Y', (byte)'P', (byte)'E', (byte)' ', (byte)'h', (byte)'t', (byte)'m', (byte)'l', (byte)'>',
(byte)'<', (byte)'h', (byte)'t', (byte)'m', (byte)'l', (byte)'>',
},
new byte[]
{
(byte)'<', (byte)'/', (byte)'h', (byte)'t', (byte)'m', (byte)'l', (byte)'>',
});
"<!DOCTYPE html><html>"u8.ToArray(),
"</html>"u8.ToArray());

private readonly IBufferWriter<byte> writer;

Expand Down Expand Up @@ -92,11 +85,11 @@ public void WriteElement(ReadOnlySpan<char> element)
ValidatedElementName.Validate(element, ref elementNameWriter);
var validatedElement = elementNameWriter.WrittenSpan;

w.Write((byte)'<');
w.Write('<');
w.Write(validatedElement);
w.Write(new [] { (byte)'>', (byte)'<', (byte)'/', });
w.Write("></"u8);
w.Write(validatedElement);
w.Write((byte)'>');
w.Write('>');

this.writer.Write(w.WrittenSpan);
}
Expand Down Expand Up @@ -182,7 +175,7 @@ public void WriteElement(ReadOnlySpan<char> name, Action<AttributeWriter>? attri
if (children is not null)
children(this);

writer.Write(new[] { (byte)'<', (byte)'/' });
writer.Write("</"u8);
writer.Write(validatedElement);
}
finally
Expand All @@ -200,11 +193,7 @@ public void WriteElement(ReadOnlySpan<char> name, Action<AttributeWriter>? attri
/// <exception cref="ArgumentException"><paramref name="element"/> was never initialized.</exception>
public void WriteElementSelfClosing(ValidatedElement element)
{
var start = element.start;

if (start is null)
throw new ArgumentException("element was never initialized.", nameof(element));

var start = element.start ?? throw new ArgumentException("element was never initialized.", nameof(element));
this.writer.Write(start);
}

Expand All @@ -219,9 +208,9 @@ public void WriteElementSelfClosing(ReadOnlySpan<char> name)

try
{
w.Write((byte)'<');
w.Write('<');
ValidatedElementName.Validate(name, ref w);
w.Write((byte)'>');
w.Write('>');

this.writer.Write(w.WrittenSpan);
}
Expand All @@ -240,11 +229,7 @@ public void WriteElementSelfClosing(ReadOnlySpan<char> name)
public void WriteElementSelfClosing(ValidatedElement element, Action<AttributeWriter>? attributes = null)
{
var writer = this.writer;
var start = element.start;

if (start is null)
throw new ArgumentException("element was never initialized.", nameof(element));

var start = element.start ?? throw new ArgumentException("element was never initialized.", nameof(element));
if (attributes is null)
writer.Write(start);
else
Expand All @@ -271,7 +256,7 @@ public void WriteElementSelfClosing(ReadOnlySpan<char> name, Action<AttributeWri

try
{
w.Write((byte)'<');
w.Write('<');
ValidatedElementName.Validate(name, ref w);

writer.Write(w.WrittenSpan);
Expand Down Expand Up @@ -331,9 +316,9 @@ public void WriteScript(ValidatedScript script)
if (script.value is null)
throw new ArgumentException("script was never initialized.", nameof(script));

writer.Write(new[] { (byte)'<', (byte)'s', (byte)'c', (byte)'r', (byte)'i', (byte)'p', (byte)'t' });
writer.Write("<script"u8);
writer.Write(script.value);
writer.Write(new[] { (byte)'<', (byte)'/', (byte)'s', (byte)'c', (byte)'r', (byte)'i', (byte)'p', (byte)'t', (byte)'>' });
writer.Write("</script>"u8);
}

/// <summary>
Expand Down Expand Up @@ -443,7 +428,7 @@ static async Task WriteElementAsync(
cancellationToken.ThrowIfCancellationRequested();

var writer = htmlWriterAsync.writer;
writer.Write(new[] { (byte)'<', (byte)'/' });
writer.Write("</"u8);
writer.Write(validatedName.Span);
WriteGreaterThan(writer);
}
Expand Down
16 changes: 0 additions & 16 deletions HtmlUtilities/NamedCharacterReferences.cs

This file was deleted.

5 changes: 1 addition & 4 deletions HtmlUtilities/Validated/ValidatedAttribute.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,7 @@ public readonly struct ValidatedAttribute
/// <exception cref="ArgumentException"><paramref name="name"/> was never initialized.</exception>
public ValidatedAttribute(ValidatedAttributeName name, ValidatedAttributeValue value)
{
var nv = name.value;
if (nv is null)
throw new ArgumentException("name was never initialized.", nameof(name));

var nv = name.value ?? throw new ArgumentException("name was never initialized.", nameof(name));
var vv = value.value;
if (vv is null)
{
Expand Down
Loading

0 comments on commit 8d98564

Please sign in to comment.