Skip to content

Commit

Permalink
add 3.3
Browse files Browse the repository at this point in the history
  • Loading branch information
ShusenTang committed Dec 21, 2019
1 parent 8a41f7f commit bf691c2
Show file tree
Hide file tree
Showing 3 changed files with 13,798 additions and 0 deletions.
214 changes: 214 additions & 0 deletions code/chapter3/3.3.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3.3 文本数据"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Read the entire text file into a single string.\n",
"with open('../../data/chapter3/1342-0.txt', encoding='utf8') as text_file:\n",
"    text = text_file.read()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Break the text at every '\\n' and keep the entry at index 200 as the example line.\n",
"lines = text.split('\\n')\n",
"line = lines[200]\n",
"line"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([70, 128])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per character of the line; 128 columns because of the ASCII limit\n",
"# (one column for each code 0-127).\n",
"letter_tensor = torch.zeros((len(line), 128))\n",
"letter_tensor.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for i, letter in enumerate(line.lower().strip()):\n",
"    # The text contains curly double quotes, which are not valid ASCII;\n",
"    # any code of 128 or more is masked here by mapping it to column 0.\n",
"    code_point = ord(letter)\n",
"    letter_index = code_point if code_point < 128 else 0\n",
"    letter_tensor[i, letter_index] = 1"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',\n",
" ['impossible',\n",
" 'mr',\n",
" 'bennet',\n",
" 'impossible',\n",
" 'when',\n",
" 'i',\n",
" 'am',\n",
" 'not',\n",
" 'acquainted',\n",
" 'with',\n",
" 'him'])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def clean_words(input_str, punctuation='.,;:\"!?”“_-', drop_empty=False):\n",
"    \"\"\"Split text into lowercase words, stripping punctuation from word edges.\n",
"\n",
"    Args:\n",
"        input_str: Text to tokenize; may contain newlines.\n",
"        punctuation: Characters removed from both ends of each token;\n",
"            occurrences inside a token are left untouched.\n",
"        drop_empty: When True, discard tokens that are empty after stripping\n",
"            (tokens made only of punctuation, e.g. '--'). The default False\n",
"            keeps them as '', which is the original behaviour.\n",
"\n",
"    Returns:\n",
"        List of cleaned tokens in their original order.\n",
"    \"\"\"\n",
"    # split() without arguments breaks on any whitespace run, newlines\n",
"    # included, so replacing '\\n' with ' ' beforehand is unnecessary.\n",
"    tokens = [token.strip(punctuation) for token in input_str.lower().split()]\n",
"    if drop_empty:\n",
"        tokens = [token for token in tokens if token]\n",
"    return tokens\n",
"\n",
"\n",
"words_in_line = clean_words(line)\n",
"line, words_in_line"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7261, 3394)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Vocabulary: distinct cleaned words of the whole text in sorted order,\n",
"# each mapped to its position in that ordering.\n",
"word_list = sorted(set(clean_words(text)))\n",
"word2index_dict = dict(zip(word_list, range(len(word_list))))\n",
"len(word2index_dict), word2index_dict['impossible']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0 3394 impossible\n",
" 1 4305 mr\n",
" 2 813 bennet\n",
" 3 3394 impossible\n",
" 4 7078 when\n",
" 5 3315 i\n",
" 6 415 am\n",
" 7 4436 not\n",
" 8 239 acquainted\n",
" 9 7148 with\n",
"10 3215 him\n",
"torch.Size([11, 7261])\n"
]
}
],
"source": [
"# One-hot encode the words of the line: one row per word,\n",
"# one column per vocabulary entry.\n",
"word_tensor = torch.zeros((len(words_in_line), len(word2index_dict)))\n",
"for i, word in enumerate(words_in_line):\n",
"    word_index = word2index_dict[word]\n",
"    word_tensor[i, word_index] = 1\n",
"    # Report the position in the line, the vocabulary index and the word.\n",
"    print('{:2} {:4} {}'.format(i, word_index, word))\n",
"\n",
"print(word_tensor.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit bf691c2

Please sign in to comment.