diff --git a/HtmlUtilities.Tests/CodePointTests.cs b/HtmlUtilities.Tests/CodePointTests.cs index 89702e7..c129c2d 100644 --- a/HtmlUtilities.Tests/CodePointTests.cs +++ b/HtmlUtilities.Tests/CodePointTests.cs @@ -34,15 +34,15 @@ public static void CodePointsHaveCorrectInfraCategories(int codePoint, CodePoint Assert.Equal(categories, new CodePoint(codePoint).InfraCategories); } - public static readonly object?[][] Utf8ValidTestCases = new object?[][] { - new object?[] { null, Array.Empty() }, - new object[] { "$", new CodePoint[] { 0x0024 } }, - new object[] { "£", new CodePoint[] { 0x00A3 } }, - new object[] { "ह", new CodePoint[] { 0x0939 } }, - new object[] { "€", new CodePoint[] { 0x20AC } }, - new object[] { "한", new CodePoint[] { 0xd55c } }, - new object[] { "𐍈", new CodePoint[] { 0x10348 } }, - }; + public static readonly object?[][] Utf8ValidTestCases = [ + [null, Array.Empty()], + ["$", new CodePoint[] { 0x0024 }], + ["£", new CodePoint[] { 0x00A3 }], + ["ह", new CodePoint[] { 0x0939 }], + ["€", new CodePoint[] { 0x20AC }], + ["한", new CodePoint[] { 0xd55c }], + ["𐍈", new CodePoint[] { 0x10348 }], + ]; [Theory] [MemberData(nameof(Utf8ValidTestCases))] @@ -58,20 +58,20 @@ public static void EncodeUtf8FromEnumerableCodePoints(string? expected, CodePoin Assert.Equal(expected is null ? "" : expected, UTF8.GetString(CodePoint.EncodeUtf8(value).ToArray())); } - public static readonly object?[][] Utf16TestCases = new object?[][] { - new object?[] { null, Array.Empty() }, - new object[] { "$", new CodePoint[] { 0x0024 } }, - new object[] { "€", new CodePoint[] { 0x20AC } }, - new object[] { "𐐷", new CodePoint[] { 0x10437 } }, - new object[] { "𤭢", new CodePoint[] { 0x24B62 } }, - }; + public static readonly object?[][] Utf16TestCases = [ + [null, Array.Empty()], + ["$", new CodePoint[] { 0x0024 }], + ["€", new CodePoint[] { 0x20AC }], + ["𐐷", new CodePoint[] { 0x10437 }], + ["𤭢", new CodePoint[] { 0x24B62 }], + ]; - public static readonly object?[][] Utf16TestCasesWithInvalidCodePoints = new object?[][] { - new object[] { "", new CodePoint[] { 0x110000 } }, - new object[] { "$", new CodePoint[] { 0x0024, 0x110000 } }, - new object[] { "$$", new CodePoint[] { 0x0024, 0x110000, 0x0024 } }, - new object[] { "$", new CodePoint[] { 0x110000, 0x0024 } }, - }; + public static readonly object?[][] Utf16TestCasesWithInvalidCodePoints = [ + ["", new CodePoint[] { 0x110000 }], + ["$", new CodePoint[] { 0x0024, 0x110000 }], + ["$$", new CodePoint[] { 0x0024, 0x110000, 0x0024 }], + ["$", new CodePoint[] { 0x110000, 0x0024 }], + ]; [Theory] [MemberData(nameof(Utf16TestCases))] @@ -243,8 +243,8 @@ public static void TrySpanFormattable(string expected, int codePoint) [Fact] public static void TrySpanFormattableTooShortIsFalse() { - Assert.False(new CodePoint('T').TryFormat(Span.Empty, out var charsWritten)); - Assert.False(new CodePoint(0x24B62).TryFormat(Span.Empty, out charsWritten)); + Assert.False(new CodePoint('T').TryFormat([], out var charsWritten)); + Assert.False(new CodePoint(0x24B62).TryFormat([], out charsWritten)); Span destination = stackalloc char[1]; Assert.False(new CodePoint(0x24B62).TryFormat(destination, out charsWritten)); diff --git a/HtmlUtilities.Tests/HtmlWriterAsyncTests.cs b/HtmlUtilities.Tests/HtmlWriterAsyncTests.cs index c48d97a..a78dcdd 100644 --- a/HtmlUtilities.Tests/HtmlWriterAsyncTests.cs +++ b/HtmlUtilities.Tests/HtmlWriterAsyncTests.cs @@ -208,7 +208,7 @@ await HtmlWriter.WriteDocumentAsync(buffer, null, (writer, cancellationToken) => { return writer.WriteElementAsync(new ValidatedElement("body"), children: (writer, cancellationToken) => { - return writer.WriteElementAsync(new ValidatedElement("div", new ValidatedAttribute[] { ("id", "react-app") }), attributes => attributes.Write("class", "root"), cancellationToken: cancellationToken); + return writer.WriteElementAsync(new ValidatedElement("div", [("id", "react-app")]), attributes => attributes.Write("class", "root"), cancellationToken: cancellationToken); }, cancellationToken: cancellationToken); }).ConfigureAwait(false); diff --git a/HtmlUtilities.Tests/HtmlWriterTests.cs b/HtmlUtilities.Tests/HtmlWriterTests.cs index fdb2784..ce52625 100644 --- a/HtmlUtilities.Tests/HtmlWriterTests.cs +++ b/HtmlUtilities.Tests/HtmlWriterTests.cs @@ -213,7 +213,7 @@ public static void WriteDocumentWithMixedAttributes() { writer.WriteElement(new ValidatedElement("body"), children: writer => { - writer.WriteElement(new ValidatedElement("div", new ValidatedAttribute[] { new("id", "react-app")}), attributes => attributes.Write("class", "root")); + writer.WriteElement(new ValidatedElement("div", [new("id", "react-app")]), attributes => attributes.Write("class", "root")); }); }); diff --git a/HtmlUtilities.Tests/Validated/ValidatedElementTests.cs b/HtmlUtilities.Tests/Validated/ValidatedElementTests.cs index 3f5ad94..f3ea99b 100644 --- a/HtmlUtilities.Tests/Validated/ValidatedElementTests.cs +++ b/HtmlUtilities.Tests/Validated/ValidatedElementTests.cs @@ -34,6 +34,6 @@ public static void ValidatedElementIsCorrect() [Fact] public static void ValidatedElementWithAttributesIsCorrect() { - Assert.Equal("", new ValidatedElement("html", new ValidatedAttribute[] { ("lang", "en-us") }).ToString()); + Assert.Equal("", new ValidatedElement("html", [("lang", "en-us")]).ToString()); } } diff --git a/HtmlUtilities/ArrayBuilder.cs b/HtmlUtilities/ArrayBuilder.cs index 0b81550..d3be646 100644 --- a/HtmlUtilities/ArrayBuilder.cs +++ b/HtmlUtilities/ArrayBuilder.cs @@ -8,21 +8,15 @@ namespace HtmlUtilities; /// /// The type of array desired. /// This is not part of the public API because it's easy to misuse and I don't want to help everyone fix their bugs. -internal ref struct ArrayBuilder +internal ref struct ArrayBuilder(int initialCapacity) { - private T[] buffer; - private int written; - - public ArrayBuilder(int initialCapacity) - { - // Accessing ArrayPool.Shared is inlined by the JIT, which then sees a sealed implementation, allowing removal of the vtable lookups. - // The object returned by ArrayPool.Create cannot get this benefit. - // Reference: https://github.com/dotnet/runtime/blob/main/src/libraries/System.Private.CoreLib/src/System/Buffers/ArrayPool.cs - // Using the shared pool carries risk of malfunction if another user returns a rented array but continues to modify it. - // Microsoft uses ArrayPool.Shared extensively within the .NET runtime, so if they're okay with this risk, I suppose I am, too. - buffer = ArrayPool.Shared.Rent(initialCapacity); - written = 0; - } + // Accessing ArrayPool.Shared is inlined by the JIT, which then sees a sealed implementation, allowing removal of the v-table lookups. + // The object returned by ArrayPool.Create cannot get this benefit. + // Reference: https://github.com/dotnet/runtime/blob/main/src/libraries/System.Private.CoreLib/src/System/Buffers/ArrayPool.cs + // Using the shared pool carries risk of malfunction if another user returns a rented array but continues to modify it. + // Microsoft uses ArrayPool.Shared extensively within the .NET runtime, so if they're okay with this risk, I suppose I am, too. + private T[] buffer = ArrayPool.Shared.Rent(initialCapacity); + private int written = 0; private void Grow(int amount) { @@ -59,7 +53,7 @@ public void Write(ReadOnlySpan values) written += length; } - internal T[] Buffer => buffer; + internal readonly T[] Buffer => buffer; internal readonly ReadOnlyMemory WrittenMemory => buffer.AsMemory(0, written); @@ -67,8 +61,16 @@ public void Write(ReadOnlySpan values) public readonly T[] ToArray() => WrittenSpan.ToArray(); - public readonly void Release() + public readonly void Release() => ArrayPool.Shared.Return(buffer); +} + +internal static class ArrayBuilderExtensions +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void Write(ref this ArrayBuilder arrayBuilder, char c) { - ArrayPool.Shared.Return(buffer); + System.Diagnostics.Debug.Assert(c <= byte.MaxValue); + + arrayBuilder.Write((byte)c); } } diff --git a/HtmlUtilities/AttributeWriter.cs b/HtmlUtilities/AttributeWriter.cs index 6453142..1ef7be6 100644 --- a/HtmlUtilities/AttributeWriter.cs +++ b/HtmlUtilities/AttributeWriter.cs @@ -222,7 +222,7 @@ public void Write(ReadOnlySpan name, ulong value) public void Write(ValidatedAttributeName name, ulong? value) { if (value is null) - Write(name, ReadOnlySpan.Empty); + Write(name, []); else Write(name, value.GetValueOrDefault()); } @@ -236,7 +236,7 @@ public void Write(ValidatedAttributeName name, ulong? value) public void Write(ReadOnlySpan name, ulong? value) { if (value is null) - Write(name, ReadOnlySpan.Empty); + Write(name, []); else Write(name, value.GetValueOrDefault()); } diff --git a/HtmlUtilities/CodePoint.cs b/HtmlUtilities/CodePoint.cs index 2191f01..0c74ec3 100644 --- a/HtmlUtilities/CodePoint.cs +++ b/HtmlUtilities/CodePoint.cs @@ -8,13 +8,14 @@ namespace HtmlUtilities; /// Represents a single Unicode code point as described by https://infra.spec.whatwg.org/#code-points. /// Also provided are several mechanisms to convert to and from values. /// -public readonly struct CodePoint : IEquatable, IComparable, IComparable, ISpanFormattable, IFormattable +/// The raw Unicode code point value. +public readonly struct CodePoint(uint value) : IEquatable, IComparable, IComparable, ISpanFormattable, IFormattable { /// /// Gets the raw Unicode code point value. /// Valid code points are in the range of 0 through 0x10FFFF (1114111 in decimal), but accepts the full range of . /// - public uint Value { get; } + public uint Value { get; } = value; /// /// Creates a new with the provided raw Unicode value. @@ -24,21 +25,11 @@ public CodePoint(int value) : this((uint)value) { } - /// - /// Creates a new with the provided raw Unicode value. - /// - /// The raw Unicode code point value. - public CodePoint(uint value) - { - this.Value = value; - } - // This pre-calculated lookup table provides O(1) lookup time for ASCII characters. - // https://github.com/dotnet/runtime/issues/60948 (via https://github.com/dotnet/roslyn/pull/61414) can potentially make this faster. - // It would also save 512 bytes + overhead of this statically allocated array. - // The current approach was the fastest known option at the time it was written. - private static readonly CodePointInfraCategory[] AsciiInfraCategories = new[] - { + // https://github.com/dotnet/runtime/issues/60948 (via https://github.com/dotnet/roslyn/pull/61414) accelerates it. + // It works by creating a ReadOnlySpan into a compile-time generated constant. + private static ReadOnlySpan AsciiInfraCategories => + [ ScalarValue | Ascii | C0Control | C0ControlOrSpace | Control, ScalarValue | Ascii | C0Control | C0ControlOrSpace | Control, ScalarValue | Ascii | C0Control | C0ControlOrSpace | Control, @@ -167,7 +158,7 @@ public CodePoint(uint value) ScalarValue | Ascii, ScalarValue | Ascii, ScalarValue | Ascii | Control, - }; + ]; /// /// Gets the categories of a as defined by the "infra" standard. @@ -180,7 +171,7 @@ public CodePointInfraCategory InfraCategories { // Considering that this is an HTML-oriented project, ASCII will be very common so we have a fast path for that. if (Value < AsciiInfraCategories.Length) - return AsciiInfraCategories[Value]; + return AsciiInfraCategories[(int)Value]; return NonAsciiInfraCategory(Value); } @@ -479,13 +470,13 @@ public bool TryFormat(Span destination, out int charsWritten, ReadOnlySpan public static implicit operator CodePoint(char value) => new(value); /// - /// Losslessly converts the value into an . + /// Converts the value into an . /// /// The value to convert. public static implicit operator int(CodePoint value) => (int)value.Value; /// - /// Losslessly converts the value into an . + /// Converts the value into an . /// /// The value to convert. public static implicit operator uint(CodePoint value) => value.Value; @@ -494,7 +485,7 @@ public bool TryFormat(Span destination, out int charsWritten, ReadOnlySpan /// Converts the value into an . /// /// The value to convert. - /// The of the provided can't be losslessly converted to . + /// The of the provided can't be converted to . public static explicit operator char(CodePoint value) => checked((char)value.Value); /// diff --git a/HtmlUtilities/HtmlWriter.cs b/HtmlUtilities/HtmlWriter.cs index bcfaa80..ea246b3 100644 --- a/HtmlUtilities/HtmlWriter.cs +++ b/HtmlUtilities/HtmlWriter.cs @@ -12,15 +12,8 @@ namespace HtmlUtilities; public readonly struct HtmlWriter { private static readonly ValidatedElement html = new( - new byte[] - { - (byte)'<', (byte)'!', (byte)'D', (byte)'O', (byte)'C', (byte)'T', (byte)'Y', (byte)'P', (byte)'E', (byte)' ', (byte)'h', (byte)'t', (byte)'m', (byte)'l', (byte)'>', - (byte)'<', (byte)'h', (byte)'t', (byte)'m', (byte)'l', (byte)'>', - }, - new byte[] - { - (byte)'<', (byte)'/', (byte)'h', (byte)'t', (byte)'m', (byte)'l', (byte)'>', - }); + ""u8.ToArray(), + ""u8.ToArray()); private readonly IBufferWriter writer; @@ -92,11 +85,11 @@ public void WriteElement(ReadOnlySpan element) ValidatedElementName.Validate(element, ref elementNameWriter); var validatedElement = elementNameWriter.WrittenSpan; - w.Write((byte)'<'); + w.Write('<'); w.Write(validatedElement); - w.Write(new [] { (byte)'>', (byte)'<', (byte)'/', }); + w.Write(">'); + w.Write('>'); this.writer.Write(w.WrittenSpan); } @@ -182,7 +175,7 @@ public void WriteElement(ReadOnlySpan name, Action? attri if (children is not null) children(this); - writer.Write(new[] { (byte)'<', (byte)'/' }); + writer.Write(" name, Action? attri /// was never initialized. public void WriteElementSelfClosing(ValidatedElement element) { - var start = element.start; - - if (start is null) - throw new ArgumentException("element was never initialized.", nameof(element)); - + var start = element.start ?? throw new ArgumentException("element was never initialized.", nameof(element)); this.writer.Write(start); } @@ -219,9 +208,9 @@ public void WriteElementSelfClosing(ReadOnlySpan name) try { - w.Write((byte)'<'); + w.Write('<'); ValidatedElementName.Validate(name, ref w); - w.Write((byte)'>'); + w.Write('>'); this.writer.Write(w.WrittenSpan); } @@ -240,11 +229,7 @@ public void WriteElementSelfClosing(ReadOnlySpan name) public void WriteElementSelfClosing(ValidatedElement element, Action? attributes = null) { var writer = this.writer; - var start = element.start; - - if (start is null) - throw new ArgumentException("element was never initialized.", nameof(element)); - + var start = element.start ?? throw new ArgumentException("element was never initialized.", nameof(element)); if (attributes is null) writer.Write(start); else @@ -271,7 +256,7 @@ public void WriteElementSelfClosing(ReadOnlySpan name, Action' }); + writer.Write(""u8); } /// @@ -443,7 +428,7 @@ static async Task WriteElementAsync( cancellationToken.ThrowIfCancellationRequested(); var writer = htmlWriterAsync.writer; - writer.Write(new[] { (byte)'<', (byte)'/' }); + writer.Write(" -/// See https://html.spec.whatwg.org/multipage/named-characters.html for a complete list of options, though only uses a select few. -/// -/// The runtime is optimized so that the const-byte-array-to-readonlyspan pattern doesn't allocate any memory; see https://github.com/dotnet/roslyn/pull/24621. -internal static class NamedCharacterReferences -{ - public static ReadOnlySpan Ampersand => new byte[] { (byte)'&', (byte)'a', (byte)'m', (byte)'p', (byte)';', }; - - public static ReadOnlySpan LessThan => new byte[] { (byte)'&', (byte)'l', (byte)'t', (byte)';', }; - - public static ReadOnlySpan Quote => new[] { (byte)'&', (byte)'q', (byte)'u', (byte)'o', (byte)'t', (byte)';', }; -} diff --git a/HtmlUtilities/Validated/ValidatedAttribute.cs b/HtmlUtilities/Validated/ValidatedAttribute.cs index 89b4001..ec3aab7 100644 --- a/HtmlUtilities/Validated/ValidatedAttribute.cs +++ b/HtmlUtilities/Validated/ValidatedAttribute.cs @@ -17,10 +17,7 @@ public readonly struct ValidatedAttribute /// was never initialized. public ValidatedAttribute(ValidatedAttributeName name, ValidatedAttributeValue value) { - var nv = name.value; - if (nv is null) - throw new ArgumentException("name was never initialized.", nameof(name)); - + var nv = name.value ?? throw new ArgumentException("name was never initialized.", nameof(name)); var vv = value.value; if (vv is null) { diff --git a/HtmlUtilities/Validated/ValidatedAttributeName.cs b/HtmlUtilities/Validated/ValidatedAttributeName.cs index 1131673..fd0b14a 100644 --- a/HtmlUtilities/Validated/ValidatedAttributeName.cs +++ b/HtmlUtilities/Validated/ValidatedAttributeName.cs @@ -33,7 +33,7 @@ internal static void Validate(ReadOnlySpan name, ref ArrayBuilder wr if (name.Length == 0) throw new ArgumentException("name cannot be an empty string.", nameof(name)); - writer.Write((byte)' '); + writer.Write(' '); foreach (var codePoint in CodePoint.GetEnumerable(name)) { var categories = codePoint.InfraCategories; @@ -43,7 +43,7 @@ internal static void Validate(ReadOnlySpan name, ref ArrayBuilder wr switch (codePoint.Value) { case '&': - writer.Write(NamedCharacterReferences.Ampersand); + writer.Write("&"u8); continue; // Specific characters diff --git a/HtmlUtilities/Validated/ValidatedAttributeValue.cs b/HtmlUtilities/Validated/ValidatedAttributeValue.cs index 140c0ad..3492531 100644 --- a/HtmlUtilities/Validated/ValidatedAttributeValue.cs +++ b/HtmlUtilities/Validated/ValidatedAttributeValue.cs @@ -7,7 +7,7 @@ namespace HtmlUtilities.Validated; /// public readonly struct ValidatedAttributeValue { - private static readonly byte[] Empty = new[] { (byte)'=', (byte)'"', (byte)'"' }; + private static readonly byte[] Empty = "=\"\""u8.ToArray(); internal readonly byte[]? value; @@ -260,14 +260,14 @@ internal static void Validate(ReadOnlySpan value, ref ArrayBuilder w private static void EmitUnquoted(ReadOnlySpan value, ref ArrayBuilder writer) { - writer.Write((byte)'='); + writer.Write('='); foreach (var codePoint in CodePoint.GetEnumerable(value)) { switch (codePoint.Value) { case '&': - writer.Write(NamedCharacterReferences.Ampersand); + writer.Write("&"u8); continue; } @@ -277,25 +277,25 @@ private static void EmitUnquoted(ReadOnlySpan value, ref ArrayBuilder value, ref ArrayBuilder writer) { - writer.Write((byte)'='); - writer.Write((byte)'"'); + writer.Write('='); + writer.Write('"'); foreach (var codePoint in CodePoint.GetEnumerable(value)) { switch (codePoint.Value) { case '&': - writer.Write(NamedCharacterReferences.Ampersand); + writer.Write("&"u8); continue; case '"': - writer.Write(NamedCharacterReferences.Quote); + writer.Write("""u8); continue; } codePoint.WriteUtf8To(ref writer); } - writer.Write((byte)'"'); + writer.Write('"'); } /// diff --git a/HtmlUtilities/Validated/ValidatedElement.cs b/HtmlUtilities/Validated/ValidatedElement.cs index cbb8717..0f86b0c 100644 --- a/HtmlUtilities/Validated/ValidatedElement.cs +++ b/HtmlUtilities/Validated/ValidatedElement.cs @@ -54,12 +54,9 @@ internal ValidatedElement(byte[] start, byte[] end) /// was never initialized. public ValidatedElement(ValidatedElementName name, IEnumerable? attributes) { - attributes ??= Enumerable.Empty(); - - var elementNameValue = name.value; - if (elementNameValue is null) - throw new ArgumentException("name was never initialized.", nameof(name)); + attributes ??= []; + var elementNameValue = name.value ?? throw new ArgumentException("name was never initialized.", nameof(name)); var attributeValueLengthSum = 0; foreach (var attribute in attributes) { diff --git a/HtmlUtilities/Validated/ValidatedElementName.cs b/HtmlUtilities/Validated/ValidatedElementName.cs index f468d73..75ca2c7 100644 --- a/HtmlUtilities/Validated/ValidatedElementName.cs +++ b/HtmlUtilities/Validated/ValidatedElementName.cs @@ -33,7 +33,7 @@ internal static void Validate(ReadOnlySpan name, ref ArrayBuilder wr { // https://html.spec.whatwg.org/#syntax-tag-name // Summary of above: - // - Must be at least one charcter + // - Must be at least one character // - First character must be ASCII alpha // - Rest must be ASCII alpha or ASCII digit diff --git a/HtmlUtilities/Validated/ValidatedScript.cs b/HtmlUtilities/Validated/ValidatedScript.cs index f8dcf49..e3e1dfc 100644 --- a/HtmlUtilities/Validated/ValidatedScript.cs +++ b/HtmlUtilities/Validated/ValidatedScript.cs @@ -28,7 +28,7 @@ public static ValidatedScript ForFileSource(ReadOnlySpan source, params Va foreach (var attribute in (attributes ?? Enumerable.Empty()).Prepend(new ValidatedAttribute("src", source))) writer.Write(attribute.value); - writer.Write((byte)'>'); + writer.Write('>'); return new ValidatedScript(writer.ToArray()); } @@ -51,7 +51,7 @@ public static ValidatedScript ForInlineSource(ReadOnlySpan script, params foreach (var attribute in attributes) writer.Write(attribute.value); - writer.Write((byte)'>'); + writer.Write('>'); Validate(ref writer, script); return new ValidatedScript(writer.ToArray()); diff --git a/HtmlUtilities/Validated/ValidatedText.cs b/HtmlUtilities/Validated/ValidatedText.cs index ed0ff27..11a2202 100644 --- a/HtmlUtilities/Validated/ValidatedText.cs +++ b/HtmlUtilities/Validated/ValidatedText.cs @@ -43,10 +43,10 @@ internal static void Validate(ReadOnlySpan text, ref ArrayBuilder wr switch (codePoint.Value) { case '&': - writer.Write(NamedCharacterReferences.Ampersand); + writer.Write("&"u8); continue; case '<': - writer.Write(NamedCharacterReferences.LessThan); + writer.Write("<"u8); continue; }