-
Notifications
You must be signed in to change notification settings - Fork 253
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8a41f7f
commit bf691c2
Showing
3 changed files
with
13,798 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,214 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# 3.3 文本数据" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import torch" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Read the whole text file into a single string.\n", | ||
"with open('../../data/chapter3/1342-0.txt', encoding='utf8') as book_file:\n", | ||
"    text = book_file.read()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'" | ||
] | ||
}, | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Break the text into lines and keep the one at index 200 as the running example.\n", | ||
"lines = text.split('\\n')\n", | ||
"line = lines[200]\n", | ||
"line" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"torch.Size([70, 128])" | ||
] | ||
}, | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# One row per character of the line; 128 columns, one per ASCII code point.\n", | ||
"letter_tensor = torch.zeros((len(line), 128))\n", | ||
"letter_tensor.shape" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# The text contains curly double quotes, which are not valid ASCII;\n", | ||
"# any character with a code point of 128 or above is mapped to column 0.\n", | ||
"for row, char in enumerate(line.lower().strip()):\n", | ||
"    code_point = ord(char)\n", | ||
"    column = code_point if code_point < 128 else 0\n", | ||
"    letter_tensor[row, column] = 1" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',\n", | ||
" ['impossible',\n", | ||
" 'mr',\n", | ||
" 'bennet',\n", | ||
" 'impossible',\n", | ||
" 'when',\n", | ||
" 'i',\n", | ||
" 'am',\n", | ||
" 'not',\n", | ||
" 'acquainted',\n", | ||
" 'with',\n", | ||
" 'him'])" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"def clean_words(input_str):\n", | ||
"    \"\"\"Split text into lowercase words with surrounding punctuation removed.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        input_str: Raw text; it may span several lines.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        A list of non-empty lowercase tokens in their original order.\n", | ||
"        Only leading and trailing punctuation is removed; punctuation\n", | ||
"        inside a token is kept.\n", | ||
"    \"\"\"\n", | ||
"    punctuation = '.,;:\"!?”“_-'\n", | ||
"    # split() without arguments splits on any run of whitespace, newlines included.\n", | ||
"    stripped = (word.strip(punctuation) for word in input_str.lower().split())\n", | ||
"    # A token made only of punctuation (e.g. '--') strips down to '';\n", | ||
"    # drop it so it never becomes a vocabulary entry.\n", | ||
"    return [word for word in stripped if word]\n", | ||
"# Tokenize the sample line and show it next to the resulting word list.\n", | ||
"words_in_line = clean_words(line)\n", | ||
"line, words_in_line" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"(7261, 3394)" | ||
] | ||
}, | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Vocabulary: every distinct cleaned word in the text, in sorted order;\n", | ||
"# each word's index is its position in that sorted list.\n", | ||
"word_list = sorted(set(clean_words(text)))\n", | ||
"word2index_dict = dict(zip(word_list, range(len(word_list))))\n", | ||
"len(word2index_dict), word2index_dict['impossible']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" 0 3394 impossible\n", | ||
" 1 4305 mr\n", | ||
" 2 813 bennet\n", | ||
" 3 3394 impossible\n", | ||
" 4 7078 when\n", | ||
" 5 3315 i\n", | ||
" 6 415 am\n", | ||
" 7 4436 not\n", | ||
" 8 239 acquainted\n", | ||
" 9 7148 with\n", | ||
"10 3215 him\n", | ||
"torch.Size([11, 7261])\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# One row per word of the sample line, one column per vocabulary entry.\n", | ||
"word_tensor = torch.zeros((len(words_in_line), len(word2index_dict)))\n", | ||
"for row, token in enumerate(words_in_line):\n", | ||
"    column = word2index_dict[token]\n", | ||
"    word_tensor[row, column] = 1\n", | ||
"    print('{:2} {:4} {}'.format(row, column, token))\n", | ||
"\n", | ||
"print(word_tensor.shape)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.