diff --git a/CHANGELOG.md b/CHANGELOG.md index a32f1af..6b829bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## [4.4.0](https://github.com/omrilotan/isbot/compare/v4.3.0...v4.4.0) + +- Add a naive fallback pattern for engines that do not support lookbehind in regular expressions +- Add `isbotNaive` function to identify bots using a naive approach (simpler and faster) + ## [4.3.0](https://github.com/omrilotan/isbot/compare/v4.2.0...v4.3.0) - Accept `undefined` in place of user agent string to allow headers property to be used "as is" (`request.headers["user-agent"]`) diff --git a/README.md b/README.md index cb71842..f6379a0 100644 --- a/README.md +++ b/README.md @@ -39,10 +39,18 @@ Using JSDeliver CDN you can import an iife script // isbot is global isbot(navigator.userAgent) ``` -## Additional named imports +## How `isbot` maintains accuracy + +> `isbot`'s prized possession is the accurate identification of bots using a regular expression. It uses expansive and regularly updated lists of user agent strings to create a regular expression that matches bots and only bots. +> +> This is done using a lookbehind pattern, which is not supported in all environments. For environments that do not support lookbehind, a less accurate fallback is provided. The test suite enforces the false positive and false negative rates deemed acceptable for the fallback: under 1% false positives and over 75% bot coverage. 
+ +## All named imports | import | Type | Description | | ------------------- | ------------------------------------------------- | ---------------------------------------------------------------------------- | +| isbot | _(userAgent: string): boolean_ | Check if the user agent is a bot | +| isbotNaive | _(userAgent: string): boolean_ | Check if the user agent is a bot using a naive pattern (less accurate) | | pattern | _RegExp_ | The regular expression used to identify bots | | list | _string[]_ | List of all individual pattern parts | | isbotMatch | _(userAgent: string): string \| null_ | The substring matched by the regular expression | diff --git a/package.json b/package.json index 29da451..297d867 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "isbot", - "version": "4.3.0", + "version": "4.4.0", "description": "🤖 Recognise bots/crawlers/spiders using the user agent string.", "keywords": [ "bot", @@ -44,13 +44,14 @@ "default": "./index.js" } }, + "sideEffects": false, "types": "index.d.ts", "scripts": { "prepare": "./scripts/prepare/index.js", "build": "./scripts/build/procedure.sh", "format": "./scripts/format/procedure.sh", "pretest": "npm run build && npm run prepare", - "test": "node --expose-gc node_modules/.bin/jest --verbose", + "test": "./scripts/test/procedure.sh", "prepublishOnly": "./scripts/prepublish/procedure.sh", "prestart": "which parcel || npm i parcel-bundler --no-save", "start": "parcel page/index.pug --out-dir docs", diff --git a/scripts/build/pattern.js b/scripts/build/pattern.js index 8bc2a69..2f9e5d4 100755 --- a/scripts/build/pattern.js +++ b/scripts/build/pattern.js @@ -3,10 +3,16 @@ import { writeFile } from "node:fs/promises"; import patterns from "../../src/patterns.json" assert { type: "json" }; -const pattern = new RegExp(patterns.join("|"), "i").toString(); -const code = ` -export const regex: RegExp = ${pattern}; -export const parts: number = ${patterns.length}; -export const size: number = 
${pattern.length}; -`.trim(); +const pattern = new RegExp( + patterns + .map((pattern) => pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")) + .join("|"), +).source; + +const expression = new RegExp(patterns.join("|"), "i").toString(); + +const code = [ + `export const fullPattern: string = "${pattern}";`, + `export const regularExpression: RegExp = ${expression};`, +].join("\n"); await writeFile("src/pattern.ts", code); diff --git a/scripts/test/procedure.sh b/scripts/test/procedure.sh new file mode 100755 index 0000000..7a9cefb --- /dev/null +++ b/scripts/test/procedure.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +failures=0 + +node --expose-gc node_modules/.bin/jest --verbose $@ +failures=$((failures + $?)) + +echo $(which es-check) +if [[ -z $(which es-check) ]]; then + echo "es-check not found. install locally." + npm install es-check --no-save + failures=$((failures + $?)) +fi + +es-check es2015 index.iife.js +failures=$((failures + $?)) + +echo -e "→ Number of failures: ${failures}" +exit $failures diff --git a/src/index.ts b/src/index.ts index 7a14074..2dff947 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,23 +1,44 @@ -import { regex } from "./pattern"; +import { fullPattern, regularExpression } from "./pattern"; import patternsList from "./patterns.json"; +/** + * Naive bot pattern. + */ +const naivePattern = /bot|spider|crawl|http|lighthouse/i; + // Workaround for TypeScript's type definition of imported variables and JSON files. /** * A pattern that matches bot identifiers in user agent strings. */ -export const pattern: RegExp = regex; +export const pattern = regularExpression; /** * A list of bot identifiers to be used in a regular expression against user agent strings. */ export const list: string[] = patternsList; +/** + * Check if the given user agent includes a bot pattern. Naive implementation (less accurate). 
+ */ +export const isbotNaive = (userAgent?: string | null): boolean => + Boolean(userAgent) && naivePattern.test(userAgent); + +let usedPattern: RegExp; /** * Check if the given user agent includes a bot pattern. */ -export const isbot = (userAgent?: string | null): boolean => - Boolean(userAgent) && pattern.test(userAgent); +export function isbot(userAgent?: string | null): boolean { + if (typeof usedPattern === "undefined") { + try { + // Build this RegExp dynamically to avoid syntax errors in older engines. + usedPattern = new RegExp(fullPattern, "i"); + } catch (error) { + usedPattern = naivePattern; + } + } + return Boolean(userAgent) && usedPattern.test(userAgent); +} /** * Create a custom isbot function with a custom pattern. diff --git a/src/patterns.json b/src/patterns.json index eb8621d..6cd21ad 100644 --- a/src/patterns.json +++ b/src/patterns.json @@ -11,7 +11,6 @@ "(? { describe("features", () => { test("pattern: pattern is a regex", () => { @@ -79,6 +92,65 @@ describe("isbot", () => { ); }); + describe("isbotNaive", () => { + test.each([75])( + "a large number of user agent strings can be detected (>%s%)", + (percent) => { + const ratio = + crawlers.filter((ua) => isbotNaive(ua)).length / crawlers.length; + expect(ratio).toBeLessThan(1); + expect(ratio).toBeGreaterThan(percent / 100); + }, + ); + test.each([1])( + "a small number of browsers is falsly detected as bots (<%s%)", + (percent) => { + const ratio = + browsers.filter((ua) => isbotNaive(ua)).length / browsers.length; + expect(ratio).toBeGreaterThan(0); + expect(ratio).toBeLessThan(percent / 100); + }, + ); + }); + + describe("regex fallback", () => { + beforeAll(async () => { + jest + .spyOn(globalThis, "RegExp") + .mockImplementation((pattern, flags): RegExp => { + if ((pattern as string).includes?.("?; + }); + afterAll(() => { + jest.restoreAllMocks(); + }); + test("fallback regex detects commong crawlers", () => { + USER_AGENT_COMMON.forEach((ua) => { + if (!isbotInstance(ua)) { + throw 
new Error(`Failed to detect ${ua} as bot`); + } + }); + }); + test("fallback detects gotchas as bots", () => { + USER_AGENT_GOTCHAS.forEach((ua) => { + if (!isbotInstance(ua)) { + throw new Error(`Failed to detect ${ua} as bot (gotcha)`); + } + }); + }); + test("fallback does not detect browser as bot", () => { + expect(isbotInstance(BROWSER_USER_AGENT_EXAMPLE)).toBe(false); + }); + }); + describe("fixtures", () => { test(`✔︎ ${crawlers.length} user agent string should be recognised as crawler`, () => { let successCount = 0; @@ -107,4 +179,17 @@ describe("isbot", () => { expect(successCount).toBe(browsers.length); }); }); + + describe("module interface", () => { + test("interface is as expected", async () => { + const types = Object.entries(await import("../../src/index")).map( + ([key, value]) => [key, value.constructor.name] as [string, string], + ); + expect(types).toMatchSnapshot(); + }); + test("regular expressions exports are as expected", () => { + expect(pattern).toBe(regularExpression); + expect(new RegExp(fullPattern, "i").toString()).toBe(pattern.toString()); + }); + }); });