Code for the ACL 2021 paper "A Systematic Investigation of KB-Text Embedding Alignment at Scale" (paper | slides).
Authors: Vardaan Pahuja, Yu Gu, Wenhu Chen, Mehdi Bahrami, Lei Liu, Wei-Peng Chen and Yu Su
This implementation is based on the DGL-KE and Wikipedia2Vec libraries.
pip install -r requirements.txt
cd wikinew/
./cythonize.sh
python setup.py install
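If the build succeeds, the wikipedia2vec command-line tool used in the pre-processing steps below should be available:

# Optional sanity check: this CLI is invoked later for Wikipedia pre-processing.
wikipedia2vec --help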
- Download the Wikidata Dec. 2020 triple files from here and store the directory path in the environment variable WIKIDATA_TRIPLES_DIR.
- Download the pre-processed Wikipedia files from here and store the directory path in the environment variable WIKIPEDIA_PROC_DATA.
- Download the few-shot link prediction dataset from here and store the directory path in the environment variable WIKIDATA_FS_LP_DIR.
- Download the analogical reasoning dataset from here and store the directory path in the environment variable ANALOGY_DATASET_DIR.
- Download wikipedia_links.json from here and save it in the directory corresponding to the environment variable WIKIDATA_PROC_JSON_DIR.
- Download rel_type_dict.pickle from here and save it in the directory corresponding to the environment variable WIKIDATA_PROC_JSON_DIR.
- Download entity_child_dict.pickle from here and save it in the directory corresponding to the environment variable WIKIDATA_PROC_JSON_DIR.
- Download ent_counter_names.json from here and save it in the directory corresponding to the environment variable WIKIDATA_PROC_JSON_DIR.
- Download the Wikidata March 2020 triple files from here and store the directory path in the environment variable WIKIDATA_MAR_20_TRIPLES_DIR.
- Download the COVID case-study triples directory from here and store the directory name in the environment variable COVID_TRIPLES_DIR.
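For reference, a minimal environment setup might look like the following (all paths are placeholders; substitute wherever you saved the downloads above):

# Placeholder paths -- point these at the directories downloaded above.
export WIKIDATA_TRIPLES_DIR=/data/wikidata_triples
export WIKIPEDIA_PROC_DATA=/data/wikipedia_proc
export WIKIDATA_FS_LP_DIR=/data/wikidata_fs_lp
export ANALOGY_DATASET_DIR=/data/analogy_dataset
export WIKIDATA_PROC_JSON_DIR=/data/wikidata_proc_json
export WIKIDATA_MAR_20_TRIPLES_DIR=/data/wikidata_mar20_triples
export COVID_TRIPLES_DIR=/data/covid_triples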
These pre-processing steps are not needed if you downloaded the pre-processed data above.
- Download the Wikidata raw dump file from the given link (Dec. 2020 version or March 2020 version) and set the environment variable RAW_WIKIDATA_JSON_FILE to its path.
- Download the Wikipedia raw dump file from here and set the environment variable DUMP_FILE to its path.
mkdir $WIKIDATA_PROC_JSON_DIR
mkdir $WIKIDATA_TRIPLES_DIR
python utils/create_proc_wikidata.py --input_json_file $RAW_WIKIDATA_JSON_FILE --out_dir $WIKIDATA_PROC_JSON_DIR
python utils/generate_triples.py $WIKIDATA_PROC_JSON_DIR $WIKIDATA_TRIPLES_DIR/triples.tsv
# We shuffle triples.tsv and split it into train/valid/test files (wikidata_train.tsv, wikidata_valid.tsv, wikidata_test.tsv) in the ratio 0.85:0.075:0.075 to train the embeddings for the Analogical Reasoning experiment; one way to do this is sketched below.
# We use triples.tsv itself as the set of training triples for the COVID-19 case study.
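# One possible recipe for the shuffle-and-split step above (a sketch, not part of
# the repo scripts; assumes GNU coreutils shuf/head/tail):
shuf $WIKIDATA_TRIPLES_DIR/triples.tsv > /tmp/triples_shuf.tsv
total=$(wc -l < /tmp/triples_shuf.tsv)
n_train=$(( total * 850 / 1000 ))   # 0.85 of the triples
n_valid=$(( total * 75 / 1000 ))    # 0.075; the remainder (~0.075) goes to test
head -n $n_train /tmp/triples_shuf.tsv > $WIKIDATA_TRIPLES_DIR/wikidata_train.tsv
tail -n +$(( n_train + 1 )) /tmp/triples_shuf.tsv | head -n $n_valid > $WIKIDATA_TRIPLES_DIR/wikidata_valid.tsv
tail -n +$(( n_train + n_valid + 1 )) /tmp/triples_shuf.tsv > $WIKIDATA_TRIPLES_DIR/wikidata_test.tsv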
python utils/create_wikipedia_wikidata_link_dict.py --input_json_file $RAW_WIKIDATA_JSON_FILE --out_links_file $WIKIDATA_PROC_JSON_DIR/wikipedia_links.json
python utils/create_entity_type_dict.py --wikidata-triples-file $WIKIDATA_TRIPLES_DIR/wikidata_train.tsv --out-dir $WIKIDATA_PROC_JSON_DIR
python utils/create_rel_type_dict.py --wikidata-triples-file $WIKIDATA_TRIPLES_DIR/wikidata_train.tsv --entity-type-dict-file $WIKIDATA_PROC_JSON_DIR/entity_type_dict.json --out-dir $WIKIDATA_PROC_JSON_DIR
python utils/create_counter_domain_intersection.py --triples-file $WIKIDATA_TRIPLES_DIR/wikidata_train.tsv --wiki-link-file $WIKIDATA_PROC_JSON_DIR/wikipedia_links.json --entity-file $WIKIDATA_TRIPLES_DIR/entities.tsv --dict-file $WIKIPEDIA_PROC_DATA/dict_file --out-dir $WIKIDATA_PROC_JSON_DIR
mkdir $WIKIPEDIA_PROC_DATA
wikipedia2vec build-dump-db $DUMP_FILE $WIKIPEDIA_PROC_DATA/db_file
wikipedia2vec build-dictionary $WIKIPEDIA_PROC_DATA/db_file $WIKIPEDIA_PROC_DATA/dict_file
wikipedia2vec build-link-graph $WIKIPEDIA_PROC_DATA/db_file $WIKIPEDIA_PROC_DATA/dict_file $WIKIPEDIA_PROC_DATA/link_graph_file
wikipedia2vec build-mention-db $WIKIPEDIA_PROC_DATA/db_file $WIKIPEDIA_PROC_DATA/dict_file $WIKIPEDIA_PROC_DATA/mention_db_file
For the few-shot link prediction experiment, set the environment variables WIKIDATA_FS_LP_DIR, WIKIDATA_PROC_JSON_DIR, WIKIPEDIA_PROC_DATA, SAVE_DIR, and BALANCE_PARAM, and navigate to the directory of the desired KB-text alignment method.
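A minimal sketch of the remaining setup (values and the directory name are placeholders; reading BALANCE_PARAM as the weight on the alignment objective is our assumption):

export SAVE_DIR=./checkpoints      # where trained embeddings are saved (--save_path)
export BALANCE_PARAM=1.0           # assumed: relative weight of the KB-text alignment objective
cd <alignment-method-dir>/         # placeholder for one of the four alignment-method directories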
# Train on the full training set:
python train.py --model_name TransE_l2 --batch_size 1000 --log_interval 10000 --neg_sample_size 200 --regularization_coef=1e-9 --hidden_dim 300 --gamma 19.9 --lr 0.25 --batch_size_eval 16 --data_path $WIKIDATA_FS_LP_DIR --data_files wikidata_train_full.tsv wikidata_test.tsv wikidata_test.tsv --format raw_udd_hrt --dump-db-file $WIKIPEDIA_PROC_DATA/db_file --dictionary-file $WIKIPEDIA_PROC_DATA/dict_file --mention-db-file $WIKIPEDIA_PROC_DATA/mention_db_file --link-graph-file $WIKIPEDIA_PROC_DATA/link_graph_file --num_thread 1 --neg_deg_sample --save_path $SAVE_DIR --balance_param $BALANCE_PARAM --reg-loss-start-epoch 0 --n_iters 20 --num_proc 8 --num_proc_train 32 --timeout 200 --wiki-link-file $WIKIDATA_PROC_JSON_DIR/wikipedia_links.json
# Train on the support training set only:
python train.py --model_name TransE_l2 --batch_size 1000 --log_interval 10000 --neg_sample_size 200 --regularization_coef=1e-9 --hidden_dim 300 --gamma 19.9 --lr 0.25 --batch_size_eval 16 --data_path $WIKIDATA_FS_LP_DIR --data_files wikidata_train_support.tsv wikidata_test.tsv wikidata_test.tsv --format raw_udd_hrt --dump-db-file $WIKIPEDIA_PROC_DATA/db_file --dictionary-file $WIKIPEDIA_PROC_DATA/dict_file --mention-db-file $WIKIPEDIA_PROC_DATA/mention_db_file --link-graph-file $WIKIPEDIA_PROC_DATA/link_graph_file --num_thread 1 --neg_deg_sample --save_path $SAVE_DIR --balance_param $BALANCE_PARAM --reg-loss-start-epoch 0 --n_iters 20 --num_proc 8 --num_proc_train 32 --timeout 200 --wiki-link-file $WIKIDATA_PROC_JSON_DIR/wikipedia_links.json
# Evaluate on the test triples with support (wikidata_test_support.tsv):
python eval_type_constraint.py --model_name TransE_l2 --hidden_dim 300 --gamma 19.9 --batch_size_eval 16 --data_path $WIKIDATA_FS_LP_DIR --data_files wikidata_train_full.tsv wikidata_test.tsv wikidata_test.tsv --format raw_udd_hrt --num_thread 1 --num_proc 1 --neg_sample_size_eval 1000 --test-triples-file $WIKIDATA_FS_LP_DIR/wikidata_test_support.tsv --model_path $SAVE_DIR/ --rel-type-dict-file $WIKIDATA_PROC_JSON_DIR/rel_type_dict.pickle --entity-child-dict-file $WIKIDATA_PROC_JSON_DIR/entity_child_dict.json --sampler-type both
# Evaluate on the test triples with missing support (wikidata_test_missing_support.tsv):
python eval_type_constraint.py --model_name TransE_l2 --hidden_dim 300 --gamma 19.9 --batch_size_eval 16 --data_path $WIKIDATA_FS_LP_DIR --data_files wikidata_train_full.tsv wikidata_test.tsv wikidata_test.tsv --format raw_udd_hrt --num_thread 1 --num_proc 1 --neg_sample_size_eval 1000 --test-triples-file $WIKIDATA_FS_LP_DIR/wikidata_test_missing_support.tsv --model_path $SAVE_DIR/ --rel-type-dict-file $WIKIDATA_PROC_JSON_DIR/rel_type_dict.pickle --entity-child-dict-file $WIKIDATA_PROC_JSON_DIR/entity_child_dict.json --sampler-type both
For the analogical reasoning experiment, set the environment variables WIKIDATA_TRIPLES_DIR, WIKIDATA_PROC_JSON_DIR, WIKIPEDIA_PROC_DATA, SAVE_DIR, and BALANCE_PARAM, and navigate to the directory of the desired KB-text alignment method.
# Train embeddings for the analogical reasoning experiment:
python train.py --model_name TransE_l2 --batch_size 1000 --log_interval 10000 --neg_sample_size 200 --regularization_coef=1e-9 --hidden_dim 300 --gamma 19.9 --lr 0.25 --batch_size_eval 16 --data_path $WIKIDATA_TRIPLES_DIR --data_files wikidata_train.tsv wikidata_valid.tsv wikidata_test.tsv --format raw_udd_hrt --dump-db-file $WIKIPEDIA_PROC_DATA/db_file --dictionary-file $WIKIPEDIA_PROC_DATA/dict_file --mention-db-file $WIKIPEDIA_PROC_DATA/mention_db_file --link-graph-file $WIKIPEDIA_PROC_DATA/link_graph_file --num_thread 1 --neg_deg_sample --save_path $SAVE_DIR --balance_param $BALANCE_PARAM --reg-loss-start-epoch 0 --n_iters 20 --num_proc 8 --num_proc_train 32 --timeout 200 --wiki-link-file $WIKIDATA_PROC_JSON_DIR/wikipedia_links.json
# Run the complete analogical reasoning evaluation:
sh utils/analogy_complete_exp.sh
The pre-trained embeddings for each of the 4 alignment methods can be downloaded below. The filenames are described as follows:
- TransE_l2_emb0_sg.npy: skip-gram embeddings for words and entities; see the word ID to name mapping file and the entity ID to name mapping file
- TransE_l2_entity.npy: TransE embeddings for entities; see the entity ID to name mapping file
- TransE_l2_relation.npy: TransE embeddings for relations; see the relation ID to name mapping file
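As a quick sanity check on the downloaded files (a sketch; assumes numpy is installed and the .npy files are in the working directory):

# The entity matrix should load as a 2-D array, e.g. (num_entities, 300) given --hidden_dim 300.
python -c "import numpy as np; print(np.load('TransE_l2_entity.npy').shape)"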
# Train embeddings on the Wikidata March 2020 triples for the COVID-19 case study:
python train.py --model_name TransE_l2 --batch_size 1000 --log_interval 10000 --neg_sample_size 200 --regularization_coef=1e-9 --hidden_dim 300 --gamma 19.9 --lr 0.25 --batch_size_eval 16 --data_path $WIKIDATA_MAR_20_TRIPLES_DIR --data_files wikidata_train.tsv --format raw_udd_hrt --dump-db-file $WIKIPEDIA_PROC_DATA/db_file --dictionary-file $WIKIPEDIA_PROC_DATA/dict_file --mention-db-file $WIKIPEDIA_PROC_DATA/mention_db_file --link-graph-file $WIKIPEDIA_PROC_DATA/link_graph_file --num_thread 1 --neg_deg_sample --save_path $SAVE_DIR --balance_param $BALANCE_PARAM --reg-loss-start-epoch 0 --n_iters 20 --num_proc 8 --num_proc_train 32 --timeout 200 --wiki-link-file $WIKIDATA_MAR_20_PROC_JSON_DIR/wikipedia_links.json
# P5642 (risk factor)
python -u eval_type_constraint.py --model_name TransE_l2 --hidden_dim 300 --gamma 19.9 --batch_size_eval 1 --data_path $WIKIDATA_MAR_20_TRIPLES_DIR --data_files wikidata_train.tsv wikidata_test.tsv wikidata_test.tsv --format udd_hrt --num_thread 1 --num_proc 1 --neg_sample_size_eval 1000 --test-triples-file $COVID_TRIPLES_DIR/wikidata_test_covid_P5642.tsv --model_path $SAVE_DIR --rel-type-dict-file $WIKIDATA_MAR_20_PROC_JSON_DIR/rel_type_dict.pickle --entity-child-dict-file $WIKIDATA_MAR_20_PROC_JSON_DIR/entity_child_dict.json --sampler-type tail
# P780 (symptoms)
python -u eval_type_constraint.py --model_name TransE_l2 --hidden_dim 300 --gamma 19.9 --batch_size_eval 1 --data_path $WIKIDATA_MAR_20_TRIPLES_DIR --data_files wikidata_train.tsv wikidata_test.tsv wikidata_test.tsv --format udd_hrt --num_thread 1 --num_proc 1 --neg_sample_size_eval 1000 --test-triples-file $COVID_TRIPLES_DIR/wikidata_test_covid_P780.tsv --model_path $SAVE_DIR --rel-type-dict-file $WIKIDATA_MAR_20_PROC_JSON_DIR/rel_type_dict.pickle --entity-child-dict-file $WIKIDATA_MAR_20_PROC_JSON_DIR/entity_child_dict.json --sampler-type tail
# P509 (cause of death)
python -u eval_type_constraint.py --model_name TransE_l2 --hidden_dim 300 --gamma 19.9 --batch_size_eval 1 --data_path $WIKIDATA_MAR_20_TRIPLES_DIR --data_files wikidata_train.tsv wikidata_test.tsv wikidata_test.tsv --format udd_hrt --num_thread 1 --num_proc 1 --neg_sample_size_eval 1000 --test-triples-file $COVID_TRIPLES_DIR/wikidata_test_covid_P509.tsv --model_path $SAVE_DIR --rel-type-dict-file $WIKIDATA_MAR_20_PROC_JSON_DIR/rel_type_dict.pickle --entity-child-dict-file $WIKIDATA_MAR_20_PROC_JSON_DIR/entity_child_dict.json --sampler-type head
# P1050 (medical condition)
python -u eval_type_constraint.py --model_name TransE_l2 --hidden_dim 300 --gamma 19.9 --batch_size_eval 1 --data_path $WIKIDATA_MAR_20_TRIPLES_DIR --data_files wikidata_train.tsv wikidata_test.tsv wikidata_test.tsv --format udd_hrt --num_thread 1 --num_proc 1 --neg_sample_size_eval 1000 --test-triples-file $COVID_TRIPLES_DIR/wikidata_test_covid_P1050.tsv --model_path $SAVE_DIR --rel-type-dict-file $WIKIDATA_MAR_20_PROC_JSON_DIR/rel_type_dict.pickle --entity-child-dict-file $WIKIDATA_MAR_20_PROC_JSON_DIR/entity_child_dict.json --sampler-type head