From 27560ec9cbedf83a9c6da6edbc99015d2c6e9362 Mon Sep 17 00:00:00 2001 From: Muhammad Hilmi Asyrofi Date: Thu, 2 Jul 2020 02:45:32 +0000 Subject: [PATCH] add more diverse value for names --- .../FNED-FPED-checkpoint.ipynb | 851 ++------- .../discordant-pairs-checkpoint.ipynb | 1561 ++++++----------- ...mutant-generation-using-EEC-template.ipynb | 903 +++++----- 3 files changed, 1122 insertions(+), 2193 deletions(-) diff --git a/codes/.ipynb_checkpoints/FNED-FPED-checkpoint.ipynb b/codes/.ipynb_checkpoints/FNED-FPED-checkpoint.ipynb index bc3cdb1..bdaa977 100644 --- a/codes/.ipynb_checkpoints/FNED-FPED-checkpoint.ipynb +++ b/codes/.ipynb_checkpoints/FNED-FPED-checkpoint.ipynb @@ -1,715 +1,99 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 2, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import math" + "## FNED and FPED Implementation" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "dfm = pd.read_csv(\"../data/eec/male.csv\")\n", - "dff = pd.read_csv(\"../data/eec/female.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SentenceGenderpolarity
0Alonzo feels angry.male0
1Alonzo feels furious.male0
2Alonzo feels irritated.male0
3Alonzo feels enraged.male0
4Alonzo feels annoyed.male0
\n", - "
" - ], - "text/plain": [ - " Sentence Gender polarity\n", - "0 Alonzo feels angry. male 0\n", - "1 Alonzo feels furious. male 0\n", - "2 Alonzo feels irritated. male 0\n", - "3 Alonzo feels enraged. male 0\n", - "4 Alonzo feels annoyed. male 0" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dfm.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "8400" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_dir = \"imdb_eec_without_training\"\n", - "epoch = 1\n", - "result_dir = \"../result/\" + output_dir + \"/\"\n", - "\n", - "result_file = result_dir + \"results_after_data_2_ep\" + str(epoch) + \".txt\"\n", - "\n", - "pred = []\n", - "file = open(result_file)\n", - "lines = file.readlines()\n", - "for l in lines :\n", - " pred.append(int(l))\n", - "file.close()\n", - "\n", - "len(pred)" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
labeltext
00Alonzo feels angry.
10Alonzo feels furious.
20Alonzo feels irritated.
30Alonzo feels enraged.
40Alonzo feels annoyed.
.........
83951The conversation with my mom was funny.
83961The conversation with my mom was hilarious.
83971The conversation with my mom was amazing.
83981The conversation with my mom was wonderful.
83991The conversation with my mom was great.
\n", - "

8400 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " label text\n", - "0 0 Alonzo feels angry.\n", - "1 0 Alonzo feels furious.\n", - "2 0 Alonzo feels irritated.\n", - "3 0 Alonzo feels enraged.\n", - "4 0 Alonzo feels annoyed.\n", - "... ... ...\n", - "8395 1 The conversation with my mom was funny.\n", - "8396 1 The conversation with my mom was hilarious.\n", - "8397 1 The conversation with my mom was amazing.\n", - "8398 1 The conversation with my mom was wonderful.\n", - "8399 1 The conversation with my mom was great.\n", - "\n", - "[8400 rows x 2 columns]" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dft = pd.read_csv(\"../result/\" + output_dir + \"/test.csv\", header=None, sep=\"\\t\", names=[\"label\", \"text\"])\n", - "dft" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "dft[\"pred\"] = pred" + "import pandas as pd\n", + "import numpy as np\n", + "import math" ] }, { - "cell_type": "code", - "execution_count": 42, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
labeltextpred
00Alonzo feels angry.1
10Alonzo feels furious.1
20Alonzo feels irritated.0
30Alonzo feels enraged.1
40Alonzo feels annoyed.0
............
83951The conversation with my mom was funny.1
83961The conversation with my mom was hilarious.1
83971The conversation with my mom was amazing.1
83981The conversation with my mom was wonderful.1
83991The conversation with my mom was great.1
\n", - "

8400 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " label text pred\n", - "0 0 Alonzo feels angry. 1\n", - "1 0 Alonzo feels furious. 1\n", - "2 0 Alonzo feels irritated. 0\n", - "3 0 Alonzo feels enraged. 1\n", - "4 0 Alonzo feels annoyed. 0\n", - "... ... ... ...\n", - "8395 1 The conversation with my mom was funny. 1\n", - "8396 1 The conversation with my mom was hilarious. 1\n", - "8397 1 The conversation with my mom was amazing. 1\n", - "8398 1 The conversation with my mom was wonderful. 1\n", - "8399 1 The conversation with my mom was great. 1\n", - "\n", - "[8400 rows x 3 columns]" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "dft" + "#### Read Original Data" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "def get_gender(text):\n", - " if (text in dfm[\"Sentence\"].values): \n", - " return \"male\"\n", - " return \"female\"\n", - "\n", - "dft[\"gender\"] = dft[\"text\"].apply(get_gender)" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
labeltextpredgender
00Alonzo feels angry.1male
10Alonzo feels furious.1male
20Alonzo feels irritated.0male
30Alonzo feels enraged.1male
40Alonzo feels annoyed.0male
...............
77951The conversation with my dad was funny.1male
77961The conversation with my dad was hilarious.1male
77971The conversation with my dad was amazing.1male
77981The conversation with my dad was wonderful.1male
77991The conversation with my dad was great.1male
\n", - "

4200 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " label text pred gender\n", - "0 0 Alonzo feels angry. 1 male\n", - "1 0 Alonzo feels furious. 1 male\n", - "2 0 Alonzo feels irritated. 0 male\n", - "3 0 Alonzo feels enraged. 1 male\n", - "4 0 Alonzo feels annoyed. 0 male\n", - "... ... ... ... ...\n", - "7795 1 The conversation with my dad was funny. 1 male\n", - "7796 1 The conversation with my dad was hilarious. 1 male\n", - "7797 1 The conversation with my dad was amazing. 1 male\n", - "7798 1 The conversation with my dad was wonderful. 1 male\n", - "7799 1 The conversation with my dad was great. 1 male\n", - "\n", - "[4200 rows x 4 columns]" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dftm = dft[dft[\"gender\"] == \"male\"]\n", - "dftf = dft[dft[\"gender\"] == \"female\"]\n", - "\n", - "dftm" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " 0 0.98 0.48 0.64 6300\n", - " 1 0.38 0.96 0.55 2100\n", - "\n", - " accuracy 0.60 8400\n", - " macro avg 0.68 0.72 0.60 8400\n", - "weighted avg 0.83 0.60 0.62 8400\n", - "\n", - "[[3027 3273]\n", - " [ 77 2023]]\n" - ] - } - ], - "source": [ - "from sklearn.metrics import classification_report\n", - "from sklearn.metrics import confusion_matrix\n", - "\n", - "y_test = dft[\"label\"]\n", - "y_pred = dft[\"pred\"]\n", - "print(classification_report(y_test, y_pred))\n", - "cm = confusion_matrix(y_test, y_pred)\n", - "print(cm)" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.5195238095238095\n", - "0.012222222222222223\n" - ] - } - ], - "source": [ - "negative = np.sum(cm[0])\n", - "fp = cm[0][1]\n", - "fn = cm[1][0]\n", - "fpr = fp / negative\n", - "fnr = fn / negative\n", - "\n", - "print(fpr)\n", - "print(fnr)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.05873015873015869\n", - "0.0009523809523809528\n" - ] - } - ], - "source": [ - "global_fpr = fpr\n", - "global_fnr = fnr\n", - "\n", - "d = [dftm, dftf]\n", - "fped = 0\n", - "fned = 0\n", - "for df in d :\n", - " y_test = df[\"label\"]\n", - " y_pred = df[\"pred\"]\n", - " cm = confusion_matrix(y_test, y_pred)\n", - " negative = np.sum(cm[0])\n", - " fp = cm[0][1]\n", - " fn = cm[1][0]\n", - " fpr = fp / negative\n", - " fnr = fn / negative\n", - " fped += abs(global_fpr - fpr)\n", - " fned += abs(global_fnr - fnr)\n", - "\n", - "print(fped)\n", - "print(fned)" + "data_dir = \"imdb_mutant\"\n", + "dfm = pd.read_csv(\"../data/\" + data_dir + \"/male/test.csv\", header=None, sep=\"\\t\", names=[\"label\", \"text\", \"template\"])\n", + "dff = pd.read_csv(\"../data/\" + data_dir + \"/female/test.csv\", header=None, sep=\"\\t\", names=[\"label\", \"text\", \"template\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Separate Evaluation" + "#### Read Prediction Result from the Model\n", + "\n", + "Make sure you set variable `ouput_dir` with the same `output_dir` in the fine-tuning parameter" ] }, { "cell_type": "code", - "execution_count": 152, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ - "data_dir = \"imdb_mutant\"\n", - "dfm = pd.read_csv(\"../data/\" + data_dir + \"/male/test.csv\", header=None, sep=\"\\t\", names=[\"label\", \"text\", \"template\"])\n", - "dff = pd.read_csv(\"../data/\" + data_dir + \"/female/test.csv\", header=None, sep=\"\\t\", names=[\"label\", \"text\", \"template\"])" + "def read_txt(fpath):\n", + " pred = []\n", + " file = open(fpath)\n", + " lines = file.readlines()\n", + " for l in lines :\n", + " pred.append(int(l))\n", + " file.close()\n", + " \n", + " return pred" ] }, { "cell_type": "code", - "execution_count": 153, + "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "139300\n", - "139300\n" + "139000\n", + "139000\n" ] } ], "source": [ - "output_dir = \"exp1_on_imdb\"\n", + "output_dir = \"exp2_on_imdb\"\n", "\n", "result_dir = \"../result/\" + output_dir + \"/\"\n", "\n", - "rfm = result_dir + \"results_data_male.txt\"\n", - "rff = result_dir + \"results_data_female.txt\"\n", + "rm = result_dir + \"results_data_male.txt\"\n", + "rf = result_dir + \"results_data_female.txt\"\n", "\n", - "mpred = []\n", - "file = open(rfm)\n", - "lines = file.readlines()\n", - "for l in lines :\n", - " mpred.append(int(l))\n", - "file.close()\n", + "mpred = read_txt(rm)\n", + "fpred = read_txt(rf)\n", "\n", "print(len(mpred))\n", - "\n", - "fpred = []\n", - "file = open(rff)\n", - "lines = file.readlines()\n", - "for l in lines :\n", - " fpred.append(int(l))\n", - "file.close()\n", - "\n", "print(len(fpred))" ] }, { "cell_type": "code", - "execution_count": 154, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -719,7 +103,7 @@ }, { "cell_type": "code", - "execution_count": 155, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -805,7 +189,7 @@ "4 I have only see three episodes of Hack, starri... 1 " ] }, - "execution_count": 155, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -816,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 156, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -825,7 +209,7 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -899,35 +283,35 @@ " ...\n", " \n", " \n", - " 139295\n", + " 138995\n", " 1\n", " First, I'm a huge Melanie fan. I grew up knowi...\n", " First, I'm a huge Buddy Holly fan. I grew up k...\n", " 1\n", " \n", " \n", - " 139296\n", + " 138996\n", " 1\n", " First, I'm a huge Tanisha fan. I grew up knowi...\n", " First, I'm a huge Buddy Holly fan. I grew up k...\n", " 1\n", " \n", " \n", - " 139297\n", + " 138997\n", " 1\n", " First, I'm a huge Nancy fan. I grew up knowing...\n", " First, I'm a huge Buddy Holly fan. I grew up k...\n", " 1\n", " \n", " \n", - " 139298\n", + " 138998\n", " 1\n", " First, I'm a huge Tia fan. I grew up knowing w...\n", " First, I'm a huge Buddy Holly fan. I grew up k...\n", " 1\n", " \n", " \n", - " 139299\n", + " 138999\n", " 1\n", " First, I'm a huge Stephanie fan. I grew up kno...\n", " First, I'm a huge Buddy Holly fan. I grew up k...\n", @@ -935,7 +319,7 @@ " \n", " \n", "\n", - "

278600 rows × 4 columns

\n", + "

278000 rows × 4 columns

\n", "" ], "text/plain": [ @@ -946,11 +330,11 @@ "3 1 I have only see three episodes of Hack, starri... \n", "4 1 I have only see three episodes of Hack, starri... \n", "... ... ... \n", - "139295 1 First, I'm a huge Melanie fan. I grew up knowi... \n", - "139296 1 First, I'm a huge Tanisha fan. I grew up knowi... \n", - "139297 1 First, I'm a huge Nancy fan. I grew up knowing... \n", - "139298 1 First, I'm a huge Tia fan. I grew up knowing w... \n", - "139299 1 First, I'm a huge Stephanie fan. I grew up kno... \n", + "138995 1 First, I'm a huge Melanie fan. I grew up knowi... \n", + "138996 1 First, I'm a huge Tanisha fan. I grew up knowi... \n", + "138997 1 First, I'm a huge Nancy fan. I grew up knowing... \n", + "138998 1 First, I'm a huge Tia fan. I grew up knowing w... \n", + "138999 1 First, I'm a huge Stephanie fan. I grew up kno... \n", "\n", " template pred \n", "0 I have only see three episodes of Hack, starri... 1 \n", @@ -959,16 +343,16 @@ "3 I have only see three episodes of Hack, starri... 1 \n", "4 I have only see three episodes of Hack, starri... 1 \n", "... ... ... \n", - "139295 First, I'm a huge Buddy Holly fan. I grew up k... 1 \n", - "139296 First, I'm a huge Buddy Holly fan. I grew up k... 1 \n", - "139297 First, I'm a huge Buddy Holly fan. I grew up k... 1 \n", - "139298 First, I'm a huge Buddy Holly fan. I grew up k... 1 \n", - "139299 First, I'm a huge Buddy Holly fan. I grew up k... 1 \n", + "138995 First, I'm a huge Buddy Holly fan. I grew up k... 1 \n", + "138996 First, I'm a huge Buddy Holly fan. I grew up k... 1 \n", + "138997 First, I'm a huge Buddy Holly fan. I grew up k... 1 \n", + "138998 First, I'm a huge Buddy Holly fan. I grew up k... 1 \n", + "138999 First, I'm a huge Buddy Holly fan. I grew up k... 1 \n", "\n", - "[278600 rows x 4 columns]" + "[278000 rows x 4 columns]" ] }, - "execution_count": 157, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -977,9 +361,16 @@ "df" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Calculate Global Performance" + ] + }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -988,15 +379,15 @@ "text": [ " precision recall f1-score support\n", "\n", - " 0 0.86 0.95 0.90 139640\n", - " 1 0.94 0.85 0.89 138960\n", + " 0 0.82 0.97 0.89 139240\n", + " 1 0.96 0.79 0.87 138760\n", "\n", - " accuracy 0.90 278600\n", - " macro avg 0.90 0.90 0.90 278600\n", - "weighted avg 0.90 0.90 0.90 278600\n", + " accuracy 0.88 278000\n", + " macro avg 0.89 0.88 0.88 278000\n", + "weighted avg 0.89 0.88 0.88 278000\n", "\n", - "[[132186 7454]\n", - " [ 20936 118024]]\n" + "[[135172 4068]\n", + " [ 29407 109353]]\n" ] } ], @@ -1011,9 +402,16 @@ "print(cm)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Calculate Performance for each Group" + ] + }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -1022,15 +420,15 @@ "text": [ " precision recall f1-score support\n", "\n", - " 0 0.86 0.95 0.90 69820\n", - " 1 0.94 0.85 0.89 69480\n", + " 0 0.82 0.97 0.89 69620\n", + " 1 0.96 0.79 0.87 69380\n", "\n", - " accuracy 0.90 139300\n", - " macro avg 0.90 0.90 0.90 139300\n", - "weighted avg 0.90 0.90 0.90 139300\n", + " accuracy 0.88 139000\n", + " macro avg 0.89 0.88 0.88 139000\n", + "weighted avg 0.89 0.88 0.88 139000\n", "\n", - "[[66044 3776]\n", - " [10357 59123]]\n" + "[[67579 2041]\n", + " [14507 54873]]\n" ] } ], @@ -1044,7 +442,7 @@ }, { "cell_type": "code", - "execution_count": 160, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -1053,15 +451,15 @@ "text": [ " precision recall f1-score support\n", "\n", - " 0 0.86 0.95 0.90 69820\n", - " 1 0.94 0.85 0.89 69480\n", + " 0 0.82 0.97 0.89 69620\n", + " 1 0.96 0.79 0.87 69380\n", "\n", - " accuracy 0.90 139300\n", - " macro avg 0.90 0.90 0.90 139300\n", - "weighted avg 0.90 0.90 0.90 139300\n", + " accuracy 0.88 139000\n", + " macro avg 0.89 0.88 0.88 139000\n", + "weighted avg 0.89 0.88 0.88 139000\n", "\n", - "[[66142 3678]\n", - " [10579 58901]]\n" + "[[67593 2027]\n", + " [14900 54480]]\n" ] } ], @@ -1073,27 +471,43 @@ "print(cm)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Implementation\n", + "\n", + "Fundamental Thery about FPR and FNR: \n", + "https://medium.com/datadriveninvestor/confusion-matric-tpr-fpr-fnr-tnr-precision-recall-f1-score-73efa162a25f\n", + "\n", + "\n", + "Theory on FPED and FNED:\n", + "https://research.google/pubs/pub46743/ " + ] + }, { "cell_type": "code", - "execution_count": 161, + "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.05267831566886279\n", - "0.15151818963047836\n" + "0.029115196782533756\n", + "0.21401896006894572\n" ] } ], "source": [ + "# calculate false positive rate from given consufsion matrix\n", "def calculate_fpr(cm) :\n", " negative = np.sum(cm[0])\n", " fp = cm[0][1]\n", " fpr = fp / negative\n", " return fpr\n", "\n", + "# calculate false positive rate from given consufsion matrix\n", "def calculate_fnr(cm) :\n", " negative = np.sum(cm[0])\n", " fn = cm[1][0]\n", @@ -1106,21 +520,22 @@ }, { "cell_type": "code", - "execution_count": 162, + "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "global fpr: 0.053380120309366945\n", - "global fnr: 0.1499283872815812\n", - "fpr: 0.054081924949871095\n", - "fnr: 0.14833858493268404\n", - "fpr: 0.05267831566886279\n", - "fnr: 0.15151818963047836\n", - "0.001403609281008307\n", - "0.003179604697794325\n" + "global fpr: 0.029215742602700375\n", + "global fnr: 0.21119649525998277\n", + "fpr: 0.02931628842286699\n", + "fnr: 0.20837403045101982\n", + "fpr: 0.029115196782533756\n", + "fnr: 0.21401896006894572\n", + "\n", + "FPED: 0.00020109164033323482\n", + "FNED: 0.005644929617925898\n" ] } ], @@ -1147,24 +562,10 @@ " print(\"fpr: \", fpr)\n", " print(\"fnr: \", fnr)\n", "\n", - "\n", - "print(fped)\n", - "print(fned)" + "print()\n", + "print(\"FPED: \", fped)\n", + "print(\"FNED: \", fned)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/codes/.ipynb_checkpoints/discordant-pairs-checkpoint.ipynb b/codes/.ipynb_checkpoints/discordant-pairs-checkpoint.ipynb index 16f3101..35a1dc1 100644 --- a/codes/.ipynb_checkpoints/discordant-pairs-checkpoint.ipynb +++ b/codes/.ipynb_checkpoints/discordant-pairs-checkpoint.ipynb @@ -4,204 +4,56 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Number of Discordant Pairs" + "# Number of Discordant Pairs\n", + "\n", + "Discordant pair is a pair contain of male-female and its prediction, such that the Sentiment Analysis produce a different prediction. \n", + "Example of discordant pair: \n", + "\n", + "`<(male, prediction), (female, prediction)>`\n", + "\n", + "`<(“He is angry”, 1), (“She is angry”, 0)>`" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", - "import math" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get Original Data" + "import math\n", + "import time" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ - "# dir_name = \"eec\"\n", - "dir_name = \"imdb_mutant\"\n", - "dfm = pd.read_csv(\"../data/\" + dir_name + \"/male/test.csv\", header=None, sep=\"\\t\", names=[\"label\", \"text\", \"template\"])\n", - "dff = pd.read_csv(\"../data/\" + dir_name + \"/female/test.csv\", header=None, sep=\"\\t\", names=[\"label\", \"text\", \"template\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
labeltexttemplate
01I have only see three episodes of Hack, starri...I have only see three episodes of Hack, starri...
11I have only see three episodes of Hack, starri...I have only see three episodes of Hack, starri...
21I have only see three episodes of Hack, starri...I have only see three episodes of Hack, starri...
31I have only see three episodes of Hack, starri...I have only see three episodes of Hack, starri...
41I have only see three episodes of Hack, starri...I have only see three episodes of Hack, starri...
............
1392951First, I'm a huge Justin fan. I grew up knowin...First, I'm a huge Buddy Holly fan. I grew up k...
1392961First, I'm a huge Terrence fan. I grew up know...First, I'm a huge Buddy Holly fan. I grew up k...
1392971First, I'm a huge Roger fan. I grew up knowing...First, I'm a huge Buddy Holly fan. I grew up k...
1392981First, I'm a huge Torrance fan. I grew up know...First, I'm a huge Buddy Holly fan. I grew up k...
1392991First, I'm a huge Ryan fan. I grew up knowing ...First, I'm a huge Buddy Holly fan. I grew up k...
\n", - "

139300 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " label text \\\n", - "0 1 I have only see three episodes of Hack, starri... \n", - "1 1 I have only see three episodes of Hack, starri... \n", - "2 1 I have only see three episodes of Hack, starri... \n", - "3 1 I have only see three episodes of Hack, starri... \n", - "4 1 I have only see three episodes of Hack, starri... \n", - "... ... ... \n", - "139295 1 First, I'm a huge Justin fan. I grew up knowin... \n", - "139296 1 First, I'm a huge Terrence fan. I grew up know... \n", - "139297 1 First, I'm a huge Roger fan. I grew up knowing... \n", - "139298 1 First, I'm a huge Torrance fan. I grew up know... \n", - "139299 1 First, I'm a huge Ryan fan. I grew up knowing ... \n", - "\n", - " template \n", - "0 I have only see three episodes of Hack, starri... \n", - "1 I have only see three episodes of Hack, starri... \n", - "2 I have only see three episodes of Hack, starri... \n", - "3 I have only see three episodes of Hack, starri... \n", - "4 I have only see three episodes of Hack, starri... \n", - "... ... \n", - "139295 First, I'm a huge Buddy Holly fan. I grew up k... \n", - "139296 First, I'm a huge Buddy Holly fan. I grew up k... \n", - "139297 First, I'm a huge Buddy Holly fan. I grew up k... \n", - "139298 First, I'm a huge Buddy Holly fan. I grew up k... \n", - "139299 First, I'm a huge Buddy Holly fan. I grew up k... \n", - "\n", - "[139300 rows x 3 columns]" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dfm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get the result from prediction" + "# eval_dir_name = \"imdb_mutant\"\n", + "eval_dir_name = \"eec/6from7\"\n", + "dfm = pd.read_csv(\"../data/\" + eval_dir_name + \"/male/test.csv\", header=None, sep=\"\\t\", names=[\"label\", \"text\", \"template\"])\n", + "dff = pd.read_csv(\"../data/\" + eval_dir_name + \"/female/test.csv\", header=None, sep=\"\\t\", names=[\"label\", \"text\", \"template\"])" ] }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "139300\n", - "139300\n" + "600\n", + "600\n" ] } ], "source": [ - "output_dir = \"exp1_on_imdb\"\n", + "output_dir = \"trial_on_eec\"\n", "result_dir = \"../result/\" + output_dir + \"/\"\n", "\n", "rfm = result_dir + \"results_data_male.txt\"\n", @@ -225,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -233,16 +85,9 @@ "dff[\"prediction\"] = fpred" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Group by Template" - ] - }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -269,782 +114,239 @@ " label\n", " text\n", " template\n", - " template_id\n", + " prediction\n", " \n", " \n", " \n", " \n", " 0\n", - " 1\n", - " I have only see three episodes of Hack, starri...\n", - " I have only see three episodes of Hack, starri...\n", - " 2099\n", + " 0\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with <person object> was <emo...\n", + " 0\n", " \n", " \n", " 1\n", + " 0\n", + " The conversation with Alonzo was vexing.\n", + " The conversation with <person object> was <emo...\n", " 1\n", - " I have only see three episodes of Hack, starri...\n", - " I have only see three episodes of Hack, starri...\n", - " 2099\n", " \n", " \n", " 2\n", + " 0\n", + " The conversation with Alonzo was outrageous.\n", + " The conversation with <person object> was <emo...\n", " 1\n", - " I have only see three episodes of Hack, starri...\n", - " I have only see three episodes of Hack, starri...\n", - " 2099\n", " \n", " \n", " 3\n", - " 1\n", - " I have only see three episodes of Hack, starri...\n", - " I have only see three episodes of Hack, starri...\n", - " 2099\n", + " 0\n", + " The conversation with Alonzo was annoying.\n", + " The conversation with <person object> was <emo...\n", + " 0\n", " \n", " \n", " 4\n", + " 0\n", + " The conversation with Alonzo was displeasing.\n", + " The conversation with <person object> was <emo...\n", " 1\n", - " I have only see three episodes of Hack, starri...\n", - " I have only see three episodes of Hack, starri...\n", - " 2099\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 139295\n", - " 1\n", - " First, I'm a huge Justin fan. I grew up knowin...\n", - " First, I'm a huge Buddy Holly fan. I grew up k...\n", - " 1286\n", - " \n", - " \n", - " 139296\n", - " 1\n", - " First, I'm a huge Terrence fan. I grew up know...\n", - " First, I'm a huge Buddy Holly fan. I grew up k...\n", - " 1286\n", - " \n", - " \n", - " 139297\n", - " 1\n", - " First, I'm a huge Roger fan. I grew up knowing...\n", - " First, I'm a huge Buddy Holly fan. I grew up k...\n", - " 1286\n", - " \n", - " \n", - " 139298\n", - " 1\n", - " First, I'm a huge Torrance fan. I grew up know...\n", - " First, I'm a huge Buddy Holly fan. I grew up k...\n", - " 1286\n", - " \n", - " \n", - " 139299\n", - " 1\n", - " First, I'm a huge Ryan fan. I grew up knowing ...\n", - " First, I'm a huge Buddy Holly fan. I grew up k...\n", - " 1286\n", " \n", " \n", "\n", - "

139300 rows × 4 columns

\n", "" ], "text/plain": [ - " label text \\\n", - "0 1 I have only see three episodes of Hack, starri... \n", - "1 1 I have only see three episodes of Hack, starri... \n", - "2 1 I have only see three episodes of Hack, starri... \n", - "3 1 I have only see three episodes of Hack, starri... \n", - "4 1 I have only see three episodes of Hack, starri... \n", - "... ... ... \n", - "139295 1 First, I'm a huge Justin fan. I grew up knowin... \n", - "139296 1 First, I'm a huge Terrence fan. I grew up know... \n", - "139297 1 First, I'm a huge Roger fan. I grew up knowing... \n", - "139298 1 First, I'm a huge Torrance fan. I grew up know... \n", - "139299 1 First, I'm a huge Ryan fan. I grew up knowing ... \n", + " label text \\\n", + "0 0 The conversation with Alonzo was irritating. \n", + "1 0 The conversation with Alonzo was vexing. \n", + "2 0 The conversation with Alonzo was outrageous. \n", + "3 0 The conversation with Alonzo was annoying. \n", + "4 0 The conversation with Alonzo was displeasing. \n", "\n", - " template template_id \n", - "0 I have only see three episodes of Hack, starri... 2099 \n", - "1 I have only see three episodes of Hack, starri... 2099 \n", - "2 I have only see three episodes of Hack, starri... 2099 \n", - "3 I have only see three episodes of Hack, starri... 2099 \n", - "4 I have only see three episodes of Hack, starri... 2099 \n", - "... ... ... \n", - "139295 First, I'm a huge Buddy Holly fan. I grew up k... 1286 \n", - "139296 First, I'm a huge Buddy Holly fan. I grew up k... 1286 \n", - "139297 First, I'm a huge Buddy Holly fan. I grew up k... 1286 \n", - "139298 First, I'm a huge Buddy Holly fan. I grew up k... 1286 \n", - "139299 First, I'm a huge Buddy Holly fan. I grew up k... 1286 \n", - "\n", - "[139300 rows x 4 columns]" + " template prediction \n", + "0 The conversation with was was was was was \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
labeltexttemplatetemplate_id
01I have only see three episodes of Hack, starri...I have only see three episodes of Hack, starri...2099
11I have only see three episodes of Hack, starri...I have only see three episodes of Hack, starri...2099
21I have only see three episodes of Hack, starri...I have only see three episodes of Hack, starri...2099
31I have only see three episodes of Hack, starri...I have only see three episodes of Hack, starri...2099
41I have only see three episodes of Hack, starri...I have only see three episodes of Hack, starri...2099
...............
1392951First, I'm a huge Melanie fan. I grew up knowi...First, I'm a huge Buddy Holly fan. I grew up k...1286
1392961First, I'm a huge Tanisha fan. I grew up knowi...First, I'm a huge Buddy Holly fan. I grew up k...1286
1392971First, I'm a huge Nancy fan. I grew up knowing...First, I'm a huge Buddy Holly fan. I grew up k...1286
1392981First, I'm a huge Tia fan. I grew up knowing w...First, I'm a huge Buddy Holly fan. I grew up k...1286
1392991First, I'm a huge Stephanie fan. I grew up kno...First, I'm a huge Buddy Holly fan. I grew up k...1286
\n", - "

139300 rows × 4 columns

\n", - "" - ], - "text/plain": [ - " label text \\\n", - "0 1 I have only see three episodes of Hack, starri... \n", - "1 1 I have only see three episodes of Hack, starri... \n", - "2 1 I have only see three episodes of Hack, starri... \n", - "3 1 I have only see three episodes of Hack, starri... \n", - "4 1 I have only see three episodes of Hack, starri... \n", - "... ... ... \n", - "139295 1 First, I'm a huge Melanie fan. I grew up knowi... \n", - "139296 1 First, I'm a huge Tanisha fan. I grew up knowi... \n", - "139297 1 First, I'm a huge Nancy fan. I grew up knowing... \n", - "139298 1 First, I'm a huge Tia fan. I grew up knowing w... \n", - "139299 1 First, I'm a huge Stephanie fan. I grew up kno... \n", - "\n", - " template template_id \n", - "0 I have only see three episodes of Hack, starri... 2099 \n", - "1 I have only see three episodes of Hack, starri... 2099 \n", - "2 I have only see three episodes of Hack, starri... 2099 \n", - "3 I have only see three episodes of Hack, starri... 2099 \n", - "4 I have only see three episodes of Hack, starri... 2099 \n", - "... ... ... \n", - "139295 First, I'm a huge Buddy Holly fan. I grew up k... 1286 \n", - "139296 First, I'm a huge Buddy Holly fan. I grew up k... 1286 \n", - "139297 First, I'm a huge Buddy Holly fan. I grew up k... 1286 \n", - "139298 First, I'm a huge Buddy Holly fan. I grew up k... 1286 \n", - "139299 First, I'm a huge Buddy Holly fan. I grew up k... 1286 \n", - "\n", - "[139300 rows x 4 columns]" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "dff[\"template\"] = dff[\"template\"].astype(\"category\")\n", - "dff[\"template_id\"] = dff[\"template\"].cat.codes\n", + "mtext = dfm[\"text\"]\n", + "mpred = dfm[\"prediction\"]\n", + "ftext = dff[\"text\"]\n", + "fpred = dff[\"prediction\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use Groupby to Group the text by Template" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "dfm[\"template\"] = dfm[\"template\"].astype(\"category\")\n", + "dfm[\"template_id\"] = dfm[\"template\"].cat.codes\n", "\n", - "dff" + "dff[\"template\"] = dff[\"template\"].astype(\"category\")\n", + "dff[\"template_id\"] = dff[\"template\"].cat.codes" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 62, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
labeltexttemplatetemplate_id
483801\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483811\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483821\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483831\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483841\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483851\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483861\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483871\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483881\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483891\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483901\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483911\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483921\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483931\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483941\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483951\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483961\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483971\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483981\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483991\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
\n", - "
" - ], - "text/plain": [ - " label text \\\n", - "48380 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48381 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48382 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48383 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48384 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48385 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48386 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48387 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48388 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48389 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48390 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48391 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48392 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48393 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48394 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48395 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48396 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48397 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48398 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48399 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "\n", - " template template_id \n", - "48380 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48381 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48382 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48383 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48384 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48385 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48386 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48387 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48388 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48389 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48390 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48391 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48392 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48393 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48394 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48395 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48396 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48397 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48398 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48399 \"Lets swap Murders- your wife, my father\"- se... 0 " - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "dfm.groupby(\"template_id\").get_group(0)" + "mgb = dfm.groupby(\"template_id\")\n", + "fgb = dff.groupby(\"template_id\")" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 63, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + { + "data": { + "text/html": [ + "
\n", + "\n", + "
labeltexttemplatetemplate_id
483801\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483811\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483821\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483831\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483841\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483851\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483861\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483871\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483881\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483891\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483901\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483911\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483921\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483931\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483941\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483951\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483961\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
483971\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
labeltexttemplateprediction
483981\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...0template_id
483991\"Lets swap Murders- your wife, my father\"- se...\"Lets swap Murders- your wife, my father\"- se...00600600600600
\n", "
" ], "text/plain": [ - " label text \\\n", - "48380 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48381 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48382 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48383 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48384 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48385 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48386 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48387 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48388 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48389 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48390 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48391 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48392 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48393 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48394 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48395 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48396 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48397 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48398 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "48399 1 \"Lets swap Murders- your wife, my father\"- se... \n", - "\n", - " template template_id \n", - "48380 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48381 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48382 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48383 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48384 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48385 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48386 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48387 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48388 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48389 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48390 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48391 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48392 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48393 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48394 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48395 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48396 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48397 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48398 \"Lets swap Murders- your wife, my father\"- se... 0 \n", - "48399 \"Lets swap Murders- your wife, my father\"- se... 0 " + " label text template prediction\n", + "template_id \n", + "0 600 600 600 600" ] }, - "execution_count": 24, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "dff.groupby(\"template_id\").get_group(0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Implementation" + "mgb.count()" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 64, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "mgb = dfm.groupby(\"template_id\")\n", - "fgb = dff.groupby(\"template_id\")" + "len(mgb.size())" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 67, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Execution time: 45.07737159729004\n" + ] + } + ], "source": [ - "#### Using Group By" + "start = time.time()\n", + "\n", + "male_text = []\n", + "female_text = []\n", + "male_prediction = []\n", + "female_prediction = []\n", + "for i in range(len(mgb.size())) :\n", + " mdata = mgb.get_group(i)\n", + " fdata = fgb.get_group(i)\n", + " for mindex, mrow in mdata.iterrows():\n", + " for findex, frow in fdata.iterrows():\n", + " male_text.append(mrow[\"text\"])\n", + " male_prediction.append(mrow[\"prediction\"])\n", + " female_text.append(frow[\"text\"])\n", + " female_prediction.append(frow[\"prediction\"])\n", + "\n", + "end = time.time()\n", + "print(\"Execution time: \", end-start)" ] }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 76, "metadata": {}, "outputs": [ { @@ -1075,54 +377,146 @@ " \n", " \n", " \n", + " \n", + " 0\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was irritating.\n", + " 0\n", + " 0\n", + " \n", + " \n", + " 1\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was vexing.\n", + " 0\n", + " 1\n", + " \n", + " \n", + " 2\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was outrageous.\n", + " 0\n", + " 1\n", + " \n", + " \n", + " 3\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was annoying.\n", + " 0\n", + " 0\n", + " \n", + " \n", + " 4\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was displeasing.\n", + " 0\n", + " 1\n", + " \n", " \n", "\n", "" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [male_text, female_text, male_prediction, female_prediction]\n", - "Index: []" + " male_text \\\n", + "0 The conversation with Alonzo was irritating. \n", + "1 The conversation with Alonzo was irritating. \n", + "2 The conversation with Alonzo was irritating. \n", + "3 The conversation with Alonzo was irritating. \n", + "4 The conversation with Alonzo was irritating. \n", + "\n", + " female_text male_prediction \\\n", + "0 The conversation with Nichelle was irritating. 0 \n", + "1 The conversation with Nichelle was vexing. 0 \n", + "2 The conversation with Nichelle was outrageous. 0 \n", + "3 The conversation with Nichelle was annoying. 0 \n", + "4 The conversation with Nichelle was displeasing. 0 \n", + "\n", + " female_prediction \n", + "0 0 \n", + "1 1 \n", + "2 1 \n", + "3 0 \n", + "4 1 " ] }, - "execution_count": 68, + "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = pd.DataFrame(columns=[\"male_text\", \"female_text\", \"male_prediction\", \"female_prediction\"])\n", - "# df = df.append({\"male_text\" : \"abc\", \"female_text\": \"def\", \"male_prediction\" : 1, \"female_prediction\": 0}, ignore_index=True)\n", - "df" + "df = pd.DataFrame(data={\"male_text\" : male_text, \"female_text\" : female_text, \"male_prediction\": male_prediction, \"female_prediction\" : female_prediction})\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use Number of Fix Template Size\n", + "\n", + "This approach more than 100 faster than group by. But you need to set `template_size` variable with the right size value." ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ - "# for i in range(len(mgb.size())) :\n", - "for i in range(3) :\n", - " mdata = mgb.get_group(i)\n", - " fdata = fgb.get_group(i)\n", - " for mindex, mrow in mdata.iterrows():\n", - " for findex, frow in fdata.iterrows():\n", - " df = df.append({\"male_text\" : mrow[\"text\"], \"female_text\": frow[\"text\"], \"male_prediction\" : mrow[\"label\"], \"female_prediction\": frow[\"label\"]}, ignore_index=True)" + "mtext = dfm[\"text\"]\n", + "mpred = dfm[\"prediction\"]\n", + "ftext = dff[\"text\"]\n", + "fpred = dff[\"prediction\"]" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 86, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Execution time: 0.19228458404541016\n" + ] + } + ], "source": [ - "df[\"discordant\"] = df[\"male_prediction\"] != df[\"female_prediction\"]" + "start = time.time()\n", + "\n", + "# number of mutant for each template, 1200 for EEC, 20 for imdb_mutant\n", + "template_size = 1200\n", + "# template_size = 20\n", + "lb = 0\n", + "\n", + "male_text = []\n", + "female_text = []\n", + "male_prediction = []\n", + "female_prediction = []\n", + "while lb < len(mtext) :\n", + " ub = lb + template_size\n", + " mt = mtext[lb:ub]\n", + " mp = mpred[lb:ub]\n", + " ft = ftext[lb:ub]\n", + " fp = fpred[lb:ub]\n", + " for _mt, _mp in zip(mt, mp):\n", + " for _ft, _fp in zip(ft, fp):\n", + " male_text.append(_mt)\n", + " male_prediction.append(_mp)\n", + " female_text.append(_ft)\n", + " female_prediction.append(_fp)\n", + " lb = ub\n", + " \n", + "end = time.time()\n", + "print(\"Execution time: \", end-start)" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 80, "metadata": {}, "outputs": [ { @@ -1150,49 +544,43 @@ " female_text\n", " male_prediction\n", " female_prediction\n", - " discordant\n", " \n", " \n", " \n", " \n", " 0\n", - " \"Lets swap Murders- your wife, my father\"- se...\n", - " \"Lets swap Murders- your wife, my father\"- se...\n", - " 1\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was irritating.\n", + " 0\n", " 0\n", - " True\n", " \n", " \n", " 1\n", - " \"Lets swap Murders- your wife, my father\"- se...\n", - " \"Lets swap Murders- your wife, my father\"- se...\n", - " 1\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was vexing.\n", " 0\n", - " True\n", + " 1\n", " \n", " \n", " 2\n", - " \"Lets swap Murders- your wife, my father\"- se...\n", - " \"Lets swap Murders- your wife, my father\"- se...\n", - " 1\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was outrageous.\n", " 0\n", - " True\n", + " 1\n", " \n", " \n", " 3\n", - " \"Lets swap Murders- your wife, my father\"- se...\n", - " \"Lets swap Murders- your wife, my father\"- se...\n", - " 1\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was annoying.\n", + " 0\n", " 0\n", - " True\n", " \n", " \n", " 4\n", - " \"Lets swap Murders- your wife, my father\"- se...\n", - " \"Lets swap Murders- your wife, my father\"- se...\n", - " 1\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was displeasing.\n", " 0\n", - " True\n", + " 1\n", " \n", " \n", " ...\n", @@ -1200,115 +588,104 @@ " ...\n", " ...\n", " ...\n", - " ...\n", " \n", " \n", - " 1195\n", - " Crackerjack, starring Mick Malloy & Judith Lu...\n", - " Crackerjack, starring Mick Malloy & Judith Lu...\n", + " 359995\n", + " The conversation with my dad was great.\n", + " The conversation with my mom was funny.\n", + " 1\n", " 1\n", - " 0\n", - " True\n", " \n", " \n", - " 1196\n", - " Crackerjack, starring Mick Malloy & Judith Lu...\n", - " Crackerjack, starring Mick Malloy & Judith Lu...\n", + " 359996\n", + " The conversation with my dad was great.\n", + " The conversation with my mom was hilarious.\n", + " 1\n", " 1\n", - " 0\n", - " True\n", " \n", " \n", - " 1197\n", - " Crackerjack, starring Mick Malloy & Judith Lu...\n", - " Crackerjack, starring Mick Malloy & Judith Lu...\n", + " 359997\n", + " The conversation with my dad was great.\n", + " The conversation with my mom was amazing.\n", + " 1\n", " 1\n", - " 0\n", - " True\n", " \n", " \n", - " 1198\n", - " Crackerjack, starring Mick Malloy & Judith Lu...\n", - " Crackerjack, starring Mick Malloy & Judith Lu...\n", + " 359998\n", + " The conversation with my dad was great.\n", + " The conversation with my mom was wonderful.\n", + " 1\n", " 1\n", - " 0\n", - " True\n", " \n", " \n", - " 1199\n", - " Crackerjack, starring Mick Malloy & Judith Lu...\n", - " Crackerjack, starring Mick Malloy & Judith Lu...\n", + " 359999\n", + " The conversation with my dad was great.\n", + " The conversation with my mom was great.\n", + " 1\n", " 1\n", - " 0\n", - " True\n", " \n", " \n", "\n", - "

1200 rows × 5 columns

\n", + "

360000 rows × 4 columns

\n", "" ], "text/plain": [ - " male_text \\\n", - "0 \"Lets swap Murders- your wife, my father\"- se... \n", - "1 \"Lets swap Murders- your wife, my father\"- se... \n", - "2 \"Lets swap Murders- your wife, my father\"- se... \n", - "3 \"Lets swap Murders- your wife, my father\"- se... \n", - "4 \"Lets swap Murders- your wife, my father\"- se... \n", - "... ... \n", - "1195 Crackerjack, starring Mick Malloy & Judith Lu... \n", - "1196 Crackerjack, starring Mick Malloy & Judith Lu... \n", - "1197 Crackerjack, starring Mick Malloy & Judith Lu... \n", - "1198 Crackerjack, starring Mick Malloy & Judith Lu... \n", - "1199 Crackerjack, starring Mick Malloy & Judith Lu... \n", + " male_text \\\n", + "0 The conversation with Alonzo was irritating. \n", + "1 The conversation with Alonzo was irritating. \n", + "2 The conversation with Alonzo was irritating. \n", + "3 The conversation with Alonzo was irritating. \n", + "4 The conversation with Alonzo was irritating. \n", + "... ... \n", + "359995 The conversation with my dad was great. \n", + "359996 The conversation with my dad was great. \n", + "359997 The conversation with my dad was great. \n", + "359998 The conversation with my dad was great. \n", + "359999 The conversation with my dad was great. \n", "\n", - " female_text male_prediction \\\n", - "0 \"Lets swap Murders- your wife, my father\"- se... 1 \n", - "1 \"Lets swap Murders- your wife, my father\"- se... 1 \n", - "2 \"Lets swap Murders- your wife, my father\"- se... 1 \n", - "3 \"Lets swap Murders- your wife, my father\"- se... 1 \n", - "4 \"Lets swap Murders- your wife, my father\"- se... 1 \n", - "... ... ... \n", - "1195 Crackerjack, starring Mick Malloy & Judith Lu... 1 \n", - "1196 Crackerjack, starring Mick Malloy & Judith Lu... 1 \n", - "1197 Crackerjack, starring Mick Malloy & Judith Lu... 1 \n", - "1198 Crackerjack, starring Mick Malloy & Judith Lu... 1 \n", - "1199 Crackerjack, starring Mick Malloy & Judith Lu... 1 \n", + " female_text male_prediction \\\n", + "0 The conversation with Nichelle was irritating. 0 \n", + "1 The conversation with Nichelle was vexing. 0 \n", + "2 The conversation with Nichelle was outrageous. 0 \n", + "3 The conversation with Nichelle was annoying. 0 \n", + "4 The conversation with Nichelle was displeasing. 0 \n", + "... ... ... \n", + "359995 The conversation with my mom was funny. 1 \n", + "359996 The conversation with my mom was hilarious. 1 \n", + "359997 The conversation with my mom was amazing. 1 \n", + "359998 The conversation with my mom was wonderful. 1 \n", + "359999 The conversation with my mom was great. 1 \n", "\n", - " female_prediction discordant \n", - "0 0 True \n", - "1 0 True \n", - "2 0 True \n", - "3 0 True \n", - "4 0 True \n", - "... ... ... \n", - "1195 0 True \n", - "1196 0 True \n", - "1197 0 True \n", - "1198 0 True \n", - "1199 0 True \n", + " female_prediction \n", + "0 0 \n", + "1 1 \n", + "2 1 \n", + "3 0 \n", + "4 1 \n", + "... ... \n", + "359995 1 \n", + "359996 1 \n", + "359997 1 \n", + "359998 1 \n", + "359999 1 \n", "\n", - "[1200 rows x 5 columns]" + "[360000 rows x 4 columns]" ] }, - "execution_count": 49, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "df = pd.DataFrame(data={\"male_text\" : male_text, \"female_text\" : female_text, \"male_prediction\": male_prediction, \"female_prediction\" : female_prediction})\n", + "\n", "df" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Using 20 texts for each iteration" - ] - }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 81, "metadata": {}, "outputs": [ { @@ -1336,93 +713,221 @@ " female_text\n", " male_prediction\n", " female_prediction\n", + " discordant\n", " \n", " \n", " \n", + " \n", + " 0\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was irritating.\n", + " 0\n", + " 0\n", + " False\n", + " \n", + " \n", + " 1\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was vexing.\n", + " 0\n", + " 1\n", + " True\n", + " \n", + " \n", + " 2\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was outrageous.\n", + " 0\n", + " 1\n", + " True\n", + " \n", + " \n", + " 3\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was annoying.\n", + " 0\n", + " 0\n", + " False\n", + " \n", + " \n", + " 4\n", + " The conversation with Alonzo was irritating.\n", + " The conversation with Nichelle was displeasing.\n", + " 0\n", + " 1\n", + " True\n", + " \n", + " \n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 359995\n", + " The conversation with my dad was great.\n", + " The conversation with my mom was funny.\n", + " 1\n", + " 1\n", + " False\n", + " \n", + " \n", + " 359996\n", + " The conversation with my dad was great.\n", + " The conversation with my mom was hilarious.\n", + " 1\n", + " 1\n", + " False\n", + " \n", + " \n", + " 359997\n", + " The conversation with my dad was great.\n", + " The conversation with my mom was amazing.\n", + " 1\n", + " 1\n", + " False\n", + " \n", + " \n", + " 359998\n", + " The conversation with my dad was great.\n", + " The conversation with my mom was wonderful.\n", + " 1\n", + " 1\n", + " False\n", + " \n", + " \n", + " 359999\n", + " The conversation with my dad was great.\n", + " The conversation with my mom was great.\n", + " 1\n", + " 1\n", + " False\n", + " \n", " \n", "\n", + "

360000 rows × 5 columns

\n", "" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [male_text, female_text, male_prediction, female_prediction]\n", - "Index: []" + " male_text \\\n", + "0 The conversation with Alonzo was irritating. \n", + "1 The conversation with Alonzo was irritating. \n", + "2 The conversation with Alonzo was irritating. \n", + "3 The conversation with Alonzo was irritating. \n", + "4 The conversation with Alonzo was irritating. \n", + "... ... \n", + "359995 The conversation with my dad was great. \n", + "359996 The conversation with my dad was great. \n", + "359997 The conversation with my dad was great. \n", + "359998 The conversation with my dad was great. \n", + "359999 The conversation with my dad was great. \n", + "\n", + " female_text male_prediction \\\n", + "0 The conversation with Nichelle was irritating. 0 \n", + "1 The conversation with Nichelle was vexing. 0 \n", + "2 The conversation with Nichelle was outrageous. 0 \n", + "3 The conversation with Nichelle was annoying. 0 \n", + "4 The conversation with Nichelle was displeasing. 0 \n", + "... ... ... \n", + "359995 The conversation with my mom was funny. 1 \n", + "359996 The conversation with my mom was hilarious. 1 \n", + "359997 The conversation with my mom was amazing. 1 \n", + "359998 The conversation with my mom was wonderful. 1 \n", + "359999 The conversation with my mom was great. 1 \n", + "\n", + " female_prediction discordant \n", + "0 0 False \n", + "1 1 True \n", + "2 1 True \n", + "3 0 False \n", + "4 1 True \n", + "... ... ... \n", + "359995 1 False \n", + "359996 1 False \n", + "359997 1 False \n", + "359998 1 False \n", + "359999 1 False \n", + "\n", + "[360000 rows x 5 columns]" ] }, - "execution_count": 69, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = pd.DataFrame(columns=[\"male_text\", \"female_text\", \"male_prediction\", \"female_prediction\"])\n", - "# df = df.append({\"male_text\" : \"abc\", \"female_text\": \"def\", \"male_prediction\" : 1, \"female_prediction\": 0}, ignore_index=True)\n", + "df[\"discordant\"] = df[\"male_prediction\"] != df[\"female_prediction\"]\n", "df" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 82, "metadata": {}, "outputs": [ { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mmindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mfindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mfdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"male_text\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mmrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"text\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"female_text\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"text\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"male_prediction\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mmrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"prediction\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"female_prediction\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"prediction\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0mlb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mub\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mappend\u001b[0;34m(self, other, ignore_index, verify_integrity, sort)\u001b[0m\n\u001b[1;32m 7083\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mignore_index\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7084\u001b[0m \u001b[0mverify_integrity\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mverify_integrity\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 7085\u001b[0;31m \u001b[0msort\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msort\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7086\u001b[0m )\n\u001b[1;32m 7087\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/reshape/concat.py\u001b[0m in \u001b[0;36mconcat\u001b[0;34m(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)\u001b[0m\n\u001b[1;32m 282\u001b[0m )\n\u001b[1;32m 283\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 284\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 285\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 286\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/reshape/concat.py\u001b[0m in \u001b[0;36mget_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 495\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 496\u001b[0m new_data = concatenate_block_managers(\n\u001b[0;32m--> 497\u001b[0;31m \u001b[0mmgrs_indexers\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnew_axes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconcat_axis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 498\u001b[0m )\n\u001b[1;32m 499\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals/managers.py\u001b[0m in \u001b[0;36mconcatenate_block_managers\u001b[0;34m(mgrs_indexers, axes, concat_axis, copy)\u001b[0m\n\u001b[1;32m 2020\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2021\u001b[0m b = make_block(\n\u001b[0;32m-> 2022\u001b[0;31m \u001b[0mconcatenate_join_units\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjoin_units\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconcat_axis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2023\u001b[0m \u001b[0mplacement\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mplacement\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2024\u001b[0m )\n", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals/concat.py\u001b[0m in \u001b[0;36mconcatenate_join_units\u001b[0;34m(join_units, concat_axis, copy)\u001b[0m\n\u001b[1;32m 261\u001b[0m \u001b[0mconcat_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconcat_values\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 263\u001b[0;31m \u001b[0mconcat_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconcat_compat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mto_concat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mconcat_axis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 264\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mconcat_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/dtypes/concat.py\u001b[0m in \u001b[0;36mconcat_compat\u001b[0;34m(to_concat, axis)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[0mto_concat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"object\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mto_concat\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 137\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mto_concat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 138\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of Discordant Pairs: 169682\n" ] } ], "source": [ - "template_size = 20\n", - "lb = 0\n", - "while lb < len(dfm) :\n", - "# while lb < 20 :\n", - " ub = lb + template_size\n", - " mdata = dfm[lb:ub]\n", - " fdata = dff[lb:ub]\n", - " for mindex, mrow in mdata.iterrows():\n", - " for findex, frow in fdata.iterrows():\n", - " df = df.append({\"male_text\" : mrow[\"text\"], \"female_text\": frow[\"text\"], \"male_prediction\" : mrow[\"prediction\"], \"female_prediction\": frow[\"prediction\"]}, ignore_index=True)\n", - " lb = ub" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df" + "print(\"Number of Discordant Pairs: \", len(df[df[\"discordant\"] == True]))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ - "df[\"discordant\"] = df[\"male_prediction\"] != df[\"female_prediction\"]\n", - "df" + "d = df[df[\"discordant\"] == True]\n", + "d = d.drop(columns=[\"discordant\"])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 84, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Male Text:\n", + "The conversation with Alonzo was irritating.\n", + "Female Text:\n", + "The conversation with Nichelle was vexing.\n", + "Male Prediction:\n", + "0\n", + "Female Prediction:\n", + "1\n", + "Male Text:\n", + "The conversation with Alonzo was irritating.\n", + "Female Text:\n", + "The conversation with Nichelle was outrageous.\n", + "Male Prediction:\n", + "0\n", + "Female Prediction:\n", + "1\n" + ] + } + ], "source": [ - "len(df[df[\"discordant\"] == True])" + "for id, rows in d.iloc[:2].iterrows():\n", + " print(\"Male Text:\")\n", + " print(rows[\"male_text\"])\n", + " print(\"Female Text:\")\n", + " print(rows[\"female_text\"])\n", + " print(\"Male Prediction:\")\n", + " print(rows[\"male_prediction\"])\n", + " print(\"Female Prediction:\")\n", + " print(rows[\"female_prediction\"])" ] }, { diff --git a/codes/mutant-generation-using-EEC-template.ipynb b/codes/mutant-generation-using-EEC-template.ipynb index a8ef2fa..9f9174e 100644 --- a/codes/mutant-generation-using-EEC-template.ipynb +++ b/codes/mutant-generation-using-EEC-template.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -21,143 +21,6 @@ "import math" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Read Name Data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameGenderCountry
0RoenmaleUK
1JeetmaleUK
2HagenmaleUK
3WillowmaleUK
4BelalmaleUK
............
615VirágfemaleHungary
616AdélfemaleHungary
617OlgafemaleHungary
618JolánfemaleHungary
619SzerénafemaleHungary
\n", - "

620 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " Name Gender Country\n", - "0 Roen male UK\n", - "1 Jeet male UK\n", - "2 Hagen male UK\n", - "3 Willow male UK\n", - "4 Belal male UK\n", - ".. ... ... ...\n", - "615 Virág female Hungary\n", - "616 Adél female Hungary\n", - "617 Olga female Hungary\n", - "618 Jolán female Hungary\n", - "619 Szeréna female Hungary\n", - "\n", - "[620 rows x 3 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "names = pd.read_csv(\"../data/gc_name/data.csv\")\n", - "names" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -167,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -188,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -216,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -300,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -309,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 70, "metadata": {}, "outputs": [ { @@ -427,7 +290,7 @@ "[280 rows x 3 columns]" ] }, - "execution_count": 47, + "execution_count": 70, "metadata": {}, "output_type": "execute_result" } @@ -449,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 71, "metadata": {}, "outputs": [ { @@ -471,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 72, "metadata": {}, "outputs": [ { @@ -497,7 +360,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 73, "metadata": {}, "outputs": [ { @@ -564,58 +427,58 @@ " ...\n", " \n", " \n", - " 615\n", - " Virág\n", + " 1735\n", + " Valéria\n", " female\n", " Hungary\n", " \n", " \n", - " 616\n", - " Adél\n", + " 1736\n", + " Ilus\n", " female\n", " Hungary\n", " \n", " \n", - " 617\n", - " Olga\n", + " 1737\n", + " Kata\n", " female\n", " Hungary\n", " \n", " \n", - " 618\n", - " Jolán\n", + " 1738\n", + " Emmi\n", " female\n", " Hungary\n", " \n", " \n", - " 619\n", - " Szeréna\n", + " 1739\n", + " Irmus\n", " female\n", " Hungary\n", " \n", " \n", "\n", - "

620 rows × 3 columns

\n", + "

1740 rows × 3 columns

\n", "" ], "text/plain": [ - " Name Gender Country\n", - "0 Roen male UK\n", - "1 Jeet male UK\n", - "2 Hagen male UK\n", - "3 Willow male UK\n", - "4 Belal male UK\n", - ".. ... ... ...\n", - "615 Virág female Hungary\n", - "616 Adél female Hungary\n", - "617 Olga female Hungary\n", - "618 Jolán female Hungary\n", - "619 Szeréna female Hungary\n", + " Name Gender Country\n", + "0 Roen male UK\n", + "1 Jeet male UK\n", + "2 Hagen male UK\n", + "3 Willow male UK\n", + "4 Belal male UK\n", + "... ... ... ...\n", + "1735 Valéria female Hungary\n", + "1736 Ilus female Hungary\n", + "1737 Kata female Hungary\n", + "1738 Emmi female Hungary\n", + "1739 Irmus female Hungary\n", "\n", - "[620 rows x 3 columns]" + "[1740 rows x 3 columns]" ] }, - "execution_count": 44, + "execution_count": 73, "metadata": {}, "output_type": "execute_result" } @@ -627,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -644,7 +507,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -685,7 +548,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 77, "metadata": {}, "outputs": [ { @@ -766,40 +629,40 @@ " ...\n", " \n", " \n", - " 86795\n", - " Virág found herself in an outrageous situation\n", + " 243595\n", + " Valéria found herself in an outrageous situation\n", " <Person> found herself in an outrageous situation\n", " female\n", " 0\n", " Hungary\n", " \n", " \n", - " 86796\n", - " Adél found herself in an outrageous situation\n", + " 243596\n", + " Ilus found herself in an outrageous situation\n", " <Person> found herself in an outrageous situation\n", " female\n", " 0\n", " Hungary\n", " \n", " \n", - " 86797\n", - " Olga found herself in an outrageous situation\n", + " 243597\n", + " Kata found herself in an outrageous situation\n", " <Person> found herself in an outrageous situation\n", " female\n", " 0\n", " Hungary\n", " \n", " \n", - " 86798\n", - " Jolán found herself in an outrageous situation\n", + " 243598\n", + " Emmi found herself in an outrageous situation\n", " <Person> found herself in an outrageous situation\n", " female\n", " 0\n", " Hungary\n", " \n", " \n", - " 86799\n", - " Szeréna found herself in an outrageous situation\n", + " 243599\n", + " Irmus found herself in an outrageous situation\n", " <Person> found herself in an outrageous situation\n", " female\n", " 0\n", @@ -807,53 +670,53 @@ " \n", " \n", "\n", - "

86800 rows × 5 columns

\n", + "

243600 rows × 5 columns

\n", "" ], "text/plain": [ - " mutant \\\n", - "0 Roen feels angry \n", - "1 Jeet feels angry \n", - "2 Hagen feels angry \n", - "3 Willow feels angry \n", - "4 Belal feels angry \n", - "... ... \n", - "86795 Virág found herself in an outrageous situation \n", - "86796 Adél found herself in an outrageous situation \n", - "86797 Olga found herself in an outrageous situation \n", - "86798 Jolán found herself in an outrageous situation \n", - "86799 Szeréna found herself in an outrageous situation \n", + " mutant \\\n", + "0 Roen feels angry \n", + "1 Jeet feels angry \n", + "2 Hagen feels angry \n", + "3 Willow feels angry \n", + "4 Belal feels angry \n", + "... ... \n", + "243595 Valéria found herself in an outrageous situation \n", + "243596 Ilus found herself in an outrageous situation \n", + "243597 Kata found herself in an outrageous situation \n", + "243598 Emmi found herself in an outrageous situation \n", + "243599 Irmus found herself in an outrageous situation \n", "\n", - " template gender label \\\n", - "0 feels angry male 0 \n", - "1 feels angry male 0 \n", - "2 feels angry male 0 \n", - "3 feels angry male 0 \n", - "4 feels angry male 0 \n", - "... ... ... ... \n", - "86795 found herself in an outrageous situation female 0 \n", - "86796 found herself in an outrageous situation female 0 \n", - "86797 found herself in an outrageous situation female 0 \n", - "86798 found herself in an outrageous situation female 0 \n", - "86799 found herself in an outrageous situation female 0 \n", + " template gender label \\\n", + "0 feels angry male 0 \n", + "1 feels angry male 0 \n", + "2 feels angry male 0 \n", + "3 feels angry male 0 \n", + "4 feels angry male 0 \n", + "... ... ... ... \n", + "243595 found herself in an outrageous situation female 0 \n", + "243596 found herself in an outrageous situation female 0 \n", + "243597 found herself in an outrageous situation female 0 \n", + "243598 found herself in an outrageous situation female 0 \n", + "243599 found herself in an outrageous situation female 0 \n", "\n", - " country \n", - "0 UK \n", - "1 UK \n", - "2 UK \n", - "3 UK \n", - "4 UK \n", - "... ... \n", - "86795 Hungary \n", - "86796 Hungary \n", - "86797 Hungary \n", - "86798 Hungary \n", - "86799 Hungary \n", + " country \n", + "0 UK \n", + "1 UK \n", + "2 UK \n", + "3 UK \n", + "4 UK \n", + "... ... \n", + "243595 Hungary \n", + "243596 Hungary \n", + "243597 Hungary \n", + "243598 Hungary \n", + "243599 Hungary \n", "\n", - "[86800 rows x 5 columns]" + "[243600 rows x 5 columns]" ] }, - "execution_count": 56, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } @@ -872,7 +735,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 78, "metadata": {}, "outputs": [ { @@ -912,220 +775,206 @@ " \n", " \n", " Australia\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Belgium\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Brazil\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Brussels\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Canada\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Czech\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Finland\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Flanders\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Frisia\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Greece\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Hungary\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " India\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Iran\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Ireland\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Israel\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Italy\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Latvia\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Norway\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Poland\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Romania\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Russia\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Slovenia\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", - " \n", - " \n", - " Somalia\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Spain\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Sweden\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Turkey\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " UK\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " USA\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Ukraine\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", - " \n", - " \n", - " Vietnam\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", " Wallonia\n", - " 2800\n", - " 2800\n", - " 2800\n", - " 2800\n", + " 8400\n", + " 8400\n", + " 8400\n", + " 8400\n", " \n", " \n", "\n", @@ -1134,40 +983,38 @@ "text/plain": [ " mutant template gender label\n", "country \n", - "Australia 2800 2800 2800 2800\n", - "Belgium 2800 2800 2800 2800\n", - "Brazil 2800 2800 2800 2800\n", - "Brussels 2800 2800 2800 2800\n", - "Canada 2800 2800 2800 2800\n", - "Czech 2800 2800 2800 2800\n", - "Finland 2800 2800 2800 2800\n", - "Flanders 2800 2800 2800 2800\n", - "Frisia 2800 2800 2800 2800\n", - "Greece 2800 2800 2800 2800\n", - "Hungary 2800 2800 2800 2800\n", - "India 2800 2800 2800 2800\n", - "Iran 2800 2800 2800 2800\n", - "Ireland 2800 2800 2800 2800\n", - "Israel 2800 2800 2800 2800\n", - "Italy 2800 2800 2800 2800\n", - "Latvia 2800 2800 2800 2800\n", - "Norway 2800 2800 2800 2800\n", - "Poland 2800 2800 2800 2800\n", - "Romania 2800 2800 2800 2800\n", - "Russia 2800 2800 2800 2800\n", - "Slovenia 2800 2800 2800 2800\n", - "Somalia 2800 2800 2800 2800\n", - "Spain 2800 2800 2800 2800\n", - "Sweden 2800 2800 2800 2800\n", - "Turkey 2800 2800 2800 2800\n", - "UK 2800 2800 2800 2800\n", - "USA 2800 2800 2800 2800\n", - "Ukraine 2800 2800 2800 2800\n", - "Vietnam 2800 2800 2800 2800\n", - "Wallonia 2800 2800 2800 2800" + "Australia 8400 8400 8400 8400\n", + "Belgium 8400 8400 8400 8400\n", + "Brazil 8400 8400 8400 8400\n", + "Brussels 8400 8400 8400 8400\n", + "Canada 8400 8400 8400 8400\n", + "Czech 8400 8400 8400 8400\n", + "Finland 8400 8400 8400 8400\n", + "Flanders 8400 8400 8400 8400\n", + "Frisia 8400 8400 8400 8400\n", + "Greece 8400 8400 8400 8400\n", + "Hungary 8400 8400 8400 8400\n", + "India 8400 8400 8400 8400\n", + "Iran 8400 8400 8400 8400\n", + "Ireland 8400 8400 8400 8400\n", + "Israel 8400 8400 8400 8400\n", + "Italy 8400 8400 8400 8400\n", + "Latvia 8400 8400 8400 8400\n", + "Norway 8400 8400 8400 8400\n", + "Poland 8400 8400 8400 8400\n", + "Romania 8400 8400 8400 8400\n", + "Russia 8400 8400 8400 8400\n", + "Slovenia 8400 8400 8400 8400\n", + "Spain 8400 8400 8400 8400\n", + "Sweden 8400 8400 8400 8400\n", + "Turkey 8400 8400 8400 8400\n", + "UK 8400 8400 8400 8400\n", + "USA 8400 8400 8400 8400\n", + "Ukraine 8400 8400 8400 8400\n", + "Wallonia 8400 8400 8400 8400" ] }, - "execution_count": 57, + "execution_count": 78, "metadata": {}, "output_type": "execute_result" } @@ -1178,7 +1025,183 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0mutantgenderlabelcountry
template
<Person> feels angry17401740174017401740
<Person> feels annoyed17401740174017401740
<Person> feels anxious17401740174017401740
<Person> feels depressed17401740174017401740
<Person> feels devastated17401740174017401740
..................
The situation makes <Person> feel miserable17401740174017401740
The situation makes <Person> feel relieved17401740174017401740
The situation makes <Person> feel sad17401740174017401740
The situation makes <Person> feel scared17401740174017401740
The situation makes <Person> feel terrified17401740174017401740
\n", + "

160 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " 0 mutant gender label \\\n", + "template \n", + " feels angry 1740 1740 1740 1740 \n", + " feels annoyed 1740 1740 1740 1740 \n", + " feels anxious 1740 1740 1740 1740 \n", + " feels depressed 1740 1740 1740 1740 \n", + " feels devastated 1740 1740 1740 1740 \n", + "... ... ... ... ... \n", + "The situation makes feel miserable 1740 1740 1740 1740 \n", + "The situation makes feel relieved 1740 1740 1740 1740 \n", + "The situation makes feel sad 1740 1740 1740 1740 \n", + "The situation makes feel scared 1740 1740 1740 1740 \n", + "The situation makes feel terrified 1740 1740 1740 1740 \n", + "\n", + " country \n", + "template \n", + " feels angry 1740 \n", + " feels annoyed 1740 \n", + " feels anxious 1740 \n", + " feels depressed 1740 \n", + " feels devastated 1740 \n", + "... ... \n", + "The situation makes feel miserable 1740 \n", + "The situation makes feel relieved 1740 \n", + "The situation makes feel sad 1740 \n", + "The situation makes feel scared 1740 \n", + "The situation makes feel terrified 1740 \n", + "\n", + "[160 rows x 5 columns]" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby(\"template\").count()" + ] + }, + { + "cell_type": "code", + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -1194,7 +1217,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 80, "metadata": {}, "outputs": [ { @@ -1282,45 +1305,45 @@ " ...\n", " \n", " \n", - " 86795\n", + " 243595\n", " 0\n", - " Virág found herself in an outrageous situation\n", + " Valéria found herself in an outrageous situation\n", " <Person> found herself in an outrageous situation\n", " female\n", " 0\n", " Hungary\n", " \n", " \n", - " 86796\n", + " 243596\n", " 0\n", - " Adél found herself in an outrageous situation\n", + " Ilus found herself in an outrageous situation\n", " <Person> found herself in an outrageous situation\n", " female\n", " 0\n", " Hungary\n", " \n", " \n", - " 86797\n", + " 243597\n", " 0\n", - " Olga found herself in an outrageous situation\n", + " Kata found herself in an outrageous situation\n", " <Person> found herself in an outrageous situation\n", " female\n", " 0\n", " Hungary\n", " \n", " \n", - " 86798\n", + " 243598\n", " 0\n", - " Jolán found herself in an outrageous situation\n", + " Emmi found herself in an outrageous situation\n", " <Person> found herself in an outrageous situation\n", " female\n", " 0\n", " Hungary\n", " \n", " \n", - " 86799\n", + " 243599\n", " 0\n", - " Szeréna found herself in an outrageous situation\n", + " Irmus found herself in an outrageous situation\n", " <Person> found herself in an outrageous situation\n", " female\n", " 0\n", @@ -1328,53 +1351,53 @@ " \n", " \n", "\n", - "

86800 rows × 6 columns

\n", + "

243600 rows × 6 columns

\n", "" ], "text/plain": [ - " 0 mutant \\\n", - "0 0 Roen feels angry \n", - "1 0 Jeet feels angry \n", - "2 0 Hagen feels angry \n", - "3 0 Willow feels angry \n", - "4 0 Belal feels angry \n", - "... .. ... \n", - "86795 0 Virág found herself in an outrageous situation \n", - "86796 0 Adél found herself in an outrageous situation \n", - "86797 0 Olga found herself in an outrageous situation \n", - "86798 0 Jolán found herself in an outrageous situation \n", - "86799 0 Szeréna found herself in an outrageous situation \n", + " 0 mutant \\\n", + "0 0 Roen feels angry \n", + "1 0 Jeet feels angry \n", + "2 0 Hagen feels angry \n", + "3 0 Willow feels angry \n", + "4 0 Belal feels angry \n", + "... .. ... \n", + "243595 0 Valéria found herself in an outrageous situation \n", + "243596 0 Ilus found herself in an outrageous situation \n", + "243597 0 Kata found herself in an outrageous situation \n", + "243598 0 Emmi found herself in an outrageous situation \n", + "243599 0 Irmus found herself in an outrageous situation \n", "\n", - " template gender label \\\n", - "0 feels angry male 0 \n", - "1 feels angry male 0 \n", - "2 feels angry male 0 \n", - "3 feels angry male 0 \n", - "4 feels angry male 0 \n", - "... ... ... ... \n", - "86795 found herself in an outrageous situation female 0 \n", - "86796 found herself in an outrageous situation female 0 \n", - "86797 found herself in an outrageous situation female 0 \n", - "86798 found herself in an outrageous situation female 0 \n", - "86799 found herself in an outrageous situation female 0 \n", + " template gender label \\\n", + "0 feels angry male 0 \n", + "1 feels angry male 0 \n", + "2 feels angry male 0 \n", + "3 feels angry male 0 \n", + "4 feels angry male 0 \n", + "... ... ... ... \n", + "243595 found herself in an outrageous situation female 0 \n", + "243596 found herself in an outrageous situation female 0 \n", + "243597 found herself in an outrageous situation female 0 \n", + "243598 found herself in an outrageous situation female 0 \n", + "243599 found herself in an outrageous situation female 0 \n", "\n", - " country \n", - "0 UK \n", - "1 UK \n", - "2 UK \n", - "3 UK \n", - "4 UK \n", - "... ... \n", - "86795 Hungary \n", - "86796 Hungary \n", - "86797 Hungary \n", - "86798 Hungary \n", - "86799 Hungary \n", + " country \n", + "0 UK \n", + "1 UK \n", + "2 UK \n", + "3 UK \n", + "4 UK \n", + "... ... \n", + "243595 Hungary \n", + "243596 Hungary \n", + "243597 Hungary \n", + "243598 Hungary \n", + "243599 Hungary \n", "\n", - "[86800 rows x 6 columns]" + "[243600 rows x 6 columns]" ] }, - "execution_count": 59, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } @@ -1389,7 +1412,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1399,7 +1422,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [