From 21becb8a1e4374520397273676888597fc8920b7 Mon Sep 17 00:00:00 2001
From: Yomguithereal
Date: Sun, 3 Jul 2016 16:56:47 +0200
Subject: [PATCH] Adding the statcan & lein phonetic algorithms

---
 CHANGELOG.md              |  2 +
 src/phonetics/lein.js     | 78 +++++++++++++++++++++++++++++++++++++++
 src/phonetics/statcan.js  | 59 +++++++++++++++++++++++++++++
 test/endpoint.js          |  2 +
 test/phonetics/lein.js    | 75 +++++++++++++++++++++++++++++++++++++
 test/phonetics/statcan.js | 46 +++++++++++++++++++++++
 6 files changed, 262 insertions(+)
 create mode 100644 src/phonetics/lein.js
 create mode 100644 src/phonetics/statcan.js
 create mode 100644 test/phonetics/lein.js
 create mode 100644 test/phonetics/statcan.js

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 94eeae0..ce15ea5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,8 @@
 ## 0.7.0
 
 * Adding the `features/extraction/vectorizer` namespace.
+* Adding the `phonetics/lein` namespace.
+* Adding the `phonetics/statcan` namespace.
 
 ## 0.6.0
 
diff --git a/src/phonetics/lein.js b/src/phonetics/lein.js
new file mode 100644
index 0000000..bf0d266
--- /dev/null
+++ b/src/phonetics/lein.js
@@ -0,0 +1,78 @@
+/**
+ * Talisman phonetics/lein
+ * ========================
+ *
+ * The Lein name coding procedure.
+ *
+ * [Reference]:
+ * http://naldc.nal.usda.gov/download/27833/PDF
+ */
+import deburr from 'lodash/deburr';
+import {squeeze, translation} from '../helpers';
+
+/**
+ * Constants.
+ */
+
+// Letters dropped after the first one: vowels, Y, W & H.
+const DROPPED = /[AEIOUYWH]/g;
+
+// Index used to translate each remaining consonant into its digit.
+const TRANSLATION = translation('DTMNLRBFPVCJKGQSXZ', '112233444455555555');
+
+/**
+ * Helper right-padding the given code with zeros & truncating it to
+ * 4 characters.
+ *
+ * @param  {string} code - The code to pad.
+ * @return {string}      - The 4-character code.
+ */
+function pad(code) {
+  return (code + '0000').slice(0, 4);
+}
+
+/**
+ * Function taking a single name and computing its lein code.
+ *
+ * @param  {string} name - The name to process.
+ * @return {string}      - The lein code or an empty string if the name
+ *                         holds no letter.
+ *
+ * @throws {Error} The function expects the name to be a string.
+ */
+export default function lein(name) {
+  if (typeof name !== 'string')
+    throw Error('talisman/phonetics/lein: the given name is not a string.');
+
+  // Keeping only letters & blanks, trimmed so the first character is a letter
+  let code = deburr(name)
+    .toUpperCase()
+    .replace(/[^A-Z\s]/g, '')
+    .trim();
+
+  // Nothing to encode: `code[0]` would leak `undefined` into the result
+  if (!code)
+    return '';
+
+  // 1-- Keeping the first letter
+  const first = code[0];
+  code = code.slice(1);
+
+  // 2-- Dropping vowels and Y, W & H
+  code = code.replace(DROPPED, '');
+
+  // 3-- Dropping consecutive duplicates & blanks, truncating to 4 characters
+  code = squeeze(code)
+    .replace(/\s/g, '')
+    .slice(0, 4);
+
+  // 4-- Translations
+  const backup = code;
+  code = '';
+
+  for (let i = 0, l = backup.length; i < l; i++)
+    code += TRANSLATION[backup[i]] || backup[i];
+
+  // 5-- Padding with zeros & limiting the code to 4 characters
+  return pad(first + code);
+}
diff --git a/src/phonetics/statcan.js b/src/phonetics/statcan.js
new file mode 100644
index 0000000..c2c7883
--- /dev/null
+++ b/src/phonetics/statcan.js
@@ -0,0 +1,59 @@
+/**
+ * Talisman phonetics/statcan
+ * ===========================
+ *
+ * The Statistics Canada name coding technique.
+ *
+ * [Reference]:
+ * http://naldc.nal.usda.gov/download/27833/PDF
+ */
+import deburr from 'lodash/deburr';
+import {squeeze} from '../helpers';
+
+/**
+ * Constants.
+ */
+
+// Letters dropped after the first one: vowels & Y.
+const DROPPED = /[AEIOUY]/g;
+
+/**
+ * Function taking a single name and computing its statcan code.
+ *
+ * @param  {string} name - The name to process.
+ * @return {string}      - The statcan code or an empty string if the name
+ *                         holds no letter.
+ *
+ * @throws {Error} The function expects the name to be a string.
+ */
+export default function statcan(name) {
+
+  if (typeof name !== 'string')
+    throw Error('talisman/phonetics/statcan: the given name is not a string.');
+
+  // Keeping only letters & blanks, trimmed so the first character is a letter
+  let code = deburr(name)
+    .toUpperCase()
+    .replace(/[^A-Z\s]/g, '')
+    .trim();
+
+  // Nothing to encode: `code[0]` would leak `undefined` into the result
+  if (!code)
+    return '';
+
+  // 1-- Keeping the first letter
+  const first = code[0];
+  code = code.slice(1);
+
+  // 2-- Dropping vowels and Y
+  code = code.replace(DROPPED, '');
+
+  // 3-- Dropping consecutive duplicates
+  code = squeeze(code);
+
+  // 4-- Dropping blanks
+  code = code.replace(/\s/g, '');
+
+  // 5-- Limiting code size to 4
+  return (first + code).slice(0, 4);
+}
diff --git a/test/endpoint.js b/test/endpoint.js
index 7cd2651..0870f72 100644
--- a/test/endpoint.js
+++ b/test/endpoint.js
@@ -46,10 +46,12 @@ describe('phonetics', function() {
   require('./phonetics/caverphone.js');
   require('./phonetics/daitch-mokotoff.js');
   require('./phonetics/double-metaphone.js');
+  require('./phonetics/lein.js');
   require('./phonetics/metaphone.js');
   require('./phonetics/mra.js');
   require('./phonetics/nysiis.js');
   require('./phonetics/soundex.js');
+  require('./phonetics/statcan.js');
 
   describe('french', function() {
     require('./phonetics/french/phonetic.js');
diff --git a/test/phonetics/lein.js b/test/phonetics/lein.js
new file mode 100644
index 0000000..04ea486
--- /dev/null
+++ b/test/phonetics/lein.js
@@ -0,0 +1,75 @@
+/**
+ * Talisman phonetics/lein tests
+ * ==============================
+ *
+ */
+import assert from 'assert';
+import lein from '../../src/phonetics/lein';
+
+describe('lein', function() {
+
+  it('should throw if the given word is not a string.', function() {
+    assert.throws(function() {
+      lein([]);
+    }, /string/);
+  });
+
+  it('should compute the lein code correctly.', function() {
+    const tests = [
+      ['Guillaume', 'G320'],
+      ['Dabbs', 'D450'],
+      ['Daves', 'D450'],
+      ['Davies', 'D450'],
+      ['Davis', 'D450'],
+      ['Debaca', 'D450'],
+      ['Debose', 'D450'],
+      ['Debus', 'D450'],
+      ['Defazio', 'D450'],
+      ['Defigh', 'D450'],
+      ['Deveaux', 'D450'],
+      ['Devese', 'D450'],
+      ['Devies', 'D450'],
+      ['Devos', 'D450'],
+      ['Dipiazza', 'D450'],
+      ['Divish', 'D450'],
+      ['Dobak', 'D450'],
+      ['Dobbs', 'D450'],
+      ['Dobis', 'D450'],
+      ['Dobish', 'D450'],
+      ['Dobosh', 'D450'],
+      ['Doepke', 'D450'],
+      ['Dopps', 'D450'],
+      ['Doubek', 'D450'],
+      ['Doviak', 'D450'],
+      ['Dubbs', 'D450'],
+      ['Dubke', 'D450'],
+      ['Dubois', 'D450'],
+      ['Duboise', 'D450'],
+      ['Dubose', 'D450'],
+      ['Dubs', 'D450'],
+      ['Dubukey', 'D450'],
+      ['Dubus', 'D450'],
+      ['Dufek', 'D450'],
+      ['Duffek', 'D450'],
+      ['Dupas', 'D450'],
+      ['Dupois', 'D450'],
+      ['Dupuis', 'D450'],
+      ['Arlène', 'A332'],
+      ['Lüdenscheidt', 'L125']
+    ];
+
+    tests.forEach(function([word, code]) {
+      assert.strictEqual(lein(word), code, `${word} => ${code}`);
+    });
+  });
+
+  it('should return an empty string if the name holds no letter.', function() {
+    assert.strictEqual(lein(''), '');
+    assert.strictEqual(lein(' 42 '), '');
+  });
+
+  it('should ignore blanks.', function() {
+    assert.strictEqual(lein(' Dubois '), 'D450');
+    assert.strictEqual(lein('Du Bois'), 'D450');
+  });
+});
diff --git a/test/phonetics/statcan.js b/test/phonetics/statcan.js
new file mode 100644
index 0000000..d97178c
--- /dev/null
+++ b/test/phonetics/statcan.js
@@ -0,0 +1,46 @@
+/**
+ * Talisman phonetics/statcan tests
+ * =================================
+ *
+ */
+import assert from 'assert';
+import statcan from '../../src/phonetics/statcan';
+
+describe('statcan', function() {
+
+  it('should throw if the given word is not a string.', function() {
+    assert.throws(function() {
+      statcan([]);
+    }, /string/);
+  });
+
+  it('should compute the statcan code correctly.', function() {
+    const tests = [
+      ['Guillaume', 'GLM'],
+      ['Daves', 'DVS'],
+      ['Davies', 'DVS'],
+      ['Davis', 'DVS'],
+      ['Devese', 'DVS'],
+      ['Devies', 'DVS'],
+      ['Devos', 'DVS'],
+      ['Dove', 'DV'],
+      ['Divish', 'DVSH'],
+      ['Arlène', 'ARLN'],
+      ['Lüdenscheidt', 'LDNS']
+    ];
+
+    tests.forEach(function([word, code]) {
+      assert.strictEqual(statcan(word), code, `${word} => ${code}`);
+    });
+  });
+
+  it('should return an empty string if the name holds no letter.', function() {
+    assert.strictEqual(statcan(''), '');
+    assert.strictEqual(statcan(' 42 '), '');
+  });
+
+  it('should ignore blanks.', function() {
+    assert.strictEqual(statcan(' Daves '), 'DVS');
+    assert.strictEqual(statcan('Du Bois'), 'DBS');
+  });
+});