-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbibstring
executable file
·283 lines (254 loc) · 9.57 KB
/
bibstring
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
#!/usr/bin/env python3
import argparse
import re
import gzip
import datetime
# Lower-case words that journal_abbrev() drops from a title instead of
# abbreviating them.
IGNORELIST = [
    "ab",
    "and",
    "at",
    "for",
    "in",
    "of",
    "on",
    "to",
    "the",
    "um",
    "with",
    "&",
]
# Fixed substitutions applied by replace_abbreviations(): each key found in a
# string is replaced (first occurrence only) by its value. Insertion order is
# the order in which the substitutions are tried.
abbreviations = {
    # Month names
    "January": "Jan.",
    "February": "Feb.",
    "March": "Mar.",
    "April": "Apr.",
    "June": "Jun.",
    "July": "Jul.",
    "August": "Aug.",
    "September": "Sept.",
    "October": "Oct.",
    "November": "Nov.",
    "December": "Dec.",
    # Ordinals
    "first": "1st",
    # Organizations and publishers. An entry that maps to itself still counts
    # as a match, which makes the caller keep the string unchanged.
    "Association Nationale de la Recherche Technique": "Association Nationale de la Recherche Technique",
    "Brooks/Cole Publishing Company": "Brooks/Cole Publ. Co.",
    # The backslash is part of the text (LaTeX "\&"); it is written as "\\"
    # because "\&" is not a valid Python escape sequence.
    "O'Reilly \\& Associates": "O'Reilly \\& Assoc.",
    # US states, matched only when delimited by ", " and ","
    ", Alaska,": ", AK,",
    ", Colorado,": ", CO,",
    ", California,": ", CA,",
    ", District of Columbia,": ", DC,",
    ", Florida,": ", FL,",
    ", Hawaii,": ", HI,",
    ", Illinois,": ", IL,",
    ", Louisiana,": ", LA,",
    ", Massachusetts,": ", MA,",
    ", Nevada,": ", NV,",
    ", Ohio,": ", OH,",
    ", New York,": ", NY,",
    ", Pennsylvania,": ", PA,",
    ", Rhode Island,": ", RI,",
    ", Tennessee,": ", TN,",
    ", Texas,": ", TX,",
    ", Utah,": ", UT,",
    ", Washington,": ", WA,",
}
# Names that replace_abbreviations() reports as already handled, so that
# strings containing them are returned unchanged.
abbreviations_constant = ["Addison-Wesley"]
def replace_abbreviations(input_string):
    """
    Apply the fixed substitutions from the abbreviations dictionary.

    A string that equals one of the names in abbreviations_constant, or that
    contains one as a whole piece when split on ", ", "-", "' " or " '", is
    reported as handled and returned unchanged. Otherwise the first
    occurrence of every dictionary key found in the string is replaced by
    the corresponding value.

    Arguments:
    input_string -- a single string
    Returns:
    A tuple (replaced, string): a boolean telling whether the string was
    handled here, and the resulting string (the input itself when nothing
    matched).
    """
    separators = (", ", "-", "\' ", " \'")
    for protected in abbreviations_constant:
        if input_string == protected:
            return True, input_string
        if any(protected in input_string.split(sep) for sep in separators):
            return True, input_string

    result = input_string
    changed = False
    for long_form, short_form in abbreviations.items():
        if long_form in result:
            # Only the first (leftmost) occurrence of each key is replaced.
            result = result.replace(long_form, short_form, 1)
            changed = True

    return (True, result) if changed else (False, input_string)
def load_abbrev(fname):
    """
    Load the word-abbreviation table into memory.

    The file is a gzip-compressed, UTF-16 encoded, tab-separated text file.
    The first column is the word (or a prefix ending in "-") and the second
    column is its abbreviation; any further columns are ignored. Lines that
    start with "WORD" (the header) and lines with fewer than two columns
    (e.g. blank lines) are skipped.

    Arguments:
    fname -- path of the gzip-compressed abbreviations file
    Returns:
    A dictionary mapping each lower-cased word to its lower-cased
    abbreviation.
    Raises:
    FileNotFoundError -- if the abbreviations file does not exist
    """
    data = {}
    try:
        with gzip.open(fname, 'rt', encoding="utf-16") as f:
            for line in f:
                # usually the first line starts with WORD
                if line.startswith('WORD'):
                    continue
                parts = line.rstrip("\r\n").split("\t")
                # Skip blank or malformed lines instead of failing on them
                if len(parts) < 2:
                    continue
                data[parts[0].lower()] = parts[1].lower()
    except FileNotFoundError as err:
        raise FileNotFoundError("The abbreviations file is not available.") from err
    return data
# Corrections applied, in this order, to the joined abbreviation to repair
# known mistakes of the word-by-word translation. Each entry is
# (trigger, old, new): when `trigger` occurs in the text, every occurrence of
# `old` is replaced by `new`.
_JOURNAL_CORRECTIONS = (
    ("Remote. ", "Remote. ", "Remote "),
    ("~e", "~e", "~E"),
    ("~f", "~f", "~F"),
    ("~i", "~i", "~I"),
    ("Vision,", "Vision,", "Vis."),
    ("(ijes)", "ijes", "iJES"),
    ("Plos", "Plos", "PLoS"),
    ("Photo-instrumentation", "Photo-instrumentation", "Photo-Instrumentation"),
    ("Cyber-physical", "Cyber-physical", "Cyber-Physical"),
    ("Real-time", "Real-time", "Real-Time"),
    ("Cvsports", "Cvsports", "CVsports"),
    ("-computer", "-computer", "-Comput."),
    ("Networks,", "Networks,", "Networks"),
    ("Networks.", "Networks.", "Networks"),
    ("Music.", "Music.", "Music"),
    ("Control.", "Control.", "Control"),
    ("Uavision", "Uavision", "UAVision"),
    ("V Work.", "V Work.", "V Work.)"),
    ("P Work.", "P Work.", "P Work.)"),
    ("Ustralasian", "Ustralasian", "Australasian"),
    ("mmsports", "mmsports", "MMSports"),
    ("neurips", "neurips", "NeurIPS"),
)


def _abbreviate_word(word, data):
    """Return the abbreviated, capitalized form of a single title word."""
    lowered = word.lower()
    # The first matching entry, in the table's own order, wins.
    for key, abbrev in data.items():
        if key.endswith("-"):
            # A key ending with "-" is a prefix rule
            matched = lowered.startswith(key[:-1])
        else:
            # Any other key must match the whole word
            matched = lowered == key
        if matched:
            # An abbreviation of "n.a." keeps the full word
            return (lowered if abbrev == "n.a." else abbrev).capitalize()
    # Unknown word: leave all-uppercase words as they are, capitalize others
    return word if word.isupper() else word.capitalize()


def journal_abbrev(name, data):
    """
    Abbreviates a journal title.

    The title is first offered to replace_abbreviations(); if that reports a
    match its result is returned directly. Otherwise the title is split into
    words (keeping the first ": " as a separate ":" token), words listed in
    IGNORELIST are dropped, every remaining word is abbreviated using `data`,
    and a fixed list of corrections is applied to the joined result. A title
    made of a single word shorter than 12 characters is returned unchanged.

    Arguments:
    name -- the title to abbreviate
    data -- dictionary mapping lower-case words (or prefixes ending in "-")
            to lower-case abbreviations, as returned by load_abbrev(); it may
            be empty, in which case words are only capitalized
    Returns:
    The abbreviated title as a string.
    """
    # First we check if the string is handled by the fixed substitutions
    replaced, output_string = replace_abbreviations(name)
    if replaced:
        return output_string

    # No match was found in the dictionary, so proceed word by word.
    (name1, _, name2) = name.partition(": ")
    parts = re.split(r"\s+", name1)
    if name2:
        parts = parts + [":"] + re.split(r"\s+", name2)
    if len(parts) == 1 and len(parts[0]) < 12:
        return name

    # Do not abbreviate (and do not keep) words in the IGNORELIST
    n_abbrev = [
        _abbreviate_word(word, data)
        for word in parts
        if word.lower() not in IGNORELIST
    ]

    result = " ".join(n_abbrev)
    # Re-attach the colon token to the word before it
    result = result.replace(" : ", ": ")
    # Corrections due to mistakes in the previous translation
    for trigger, old, new in _JOURNAL_CORRECTIONS:
        if trigger in result:
            result = result.replace(old, new)
    return result
def abbreviate(s, data):
    """
    Return the abbreviated form of the string s.

    This is a thin wrapper that passes s and the abbreviation table data
    straight to journal_abbrev() and returns its result.
    """
    return journal_abbrev(s, data)
def process_file(input_file):
    """
    Write a copy of a BibTeX file with abbreviated @STRING definitions.

    The output is written next to the input as "<prefix>-short.bib", where
    <prefix> is the input file name without its last four characters (the
    ".bib" extension). The output starts with a "File generated by bibstring"
    header line. For every line that contains "@STRING{" or "@string{" and no
    "#", the first double-quoted text is replaced by its abbreviation; all
    other lines are copied unchanged.

    Arguments:
    input_file -- path of the input file, expected to end in ".bib"
    Raises:
    FileNotFoundError -- if the abbreviations file "abbrev.txt.gz" or the
                         input file does not exist
    """
    # Load the abbreviations before touching the output file, so that a
    # missing table does not leave an empty output file behind
    data = load_abbrev("abbrev.txt.gz")

    # Extract the file name prefix from the input file name
    prefix = input_file[:-4]

    with open(input_file, 'r') as f, open(f"{prefix}-short.bib", 'w') as out:
        # Print a header
        formatted_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        out.write("File generated by bibstring on " + formatted_time + "\n")

        for line in f:
            # Only plain string definitions are processed; definitions built
            # by concatenation (containing "#") are copied as they are
            if (("@STRING{" in line) or ("@string{" in line)) and ("#" not in line):
                match = re.search(r'"(.*?)"', line)
                if match:
                    string = abbreviate(match.group(1), data)
                    # Replace only the quoted text itself, not other
                    # occurrences of the same text elsewhere on the line
                    line = line[:match.start(1)] + string + line[match.end(1):]
            out.write(line)
def main():
    """
    Command-line entry point.

    Parses the arguments, checks that the input file name ends in ".bib" and
    runs process_file() on it. With -v/--verbose a message is printed before
    and after the processing.
    """
    parser = argparse.ArgumentParser(description="Process the input.bib file to change all @STRING macros by an abbreviated string. \nOutput file is input-short.bib")
    parser.add_argument("input", metavar="input.bib", type=str, help="BiBTeX file input.bib to be parsed")
    parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
    args = parser.parse_args()

    # Without an input file name there is nothing to do but show the usage
    if not args.input:
        parser.print_help()
        return

    # The file name must match the pattern "?*.bib"
    if not re.match(r".+\.bib$", args.input):
        print("Error: The input file must have a .bib extension.")
        return

    verbose = args.verbose
    if verbose:
        print("Starting processing...")
    process_file(args.input)
    if verbose:
        print("Processing complete.")
# Run the command-line interface only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()