-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpublications.html
executable file
·1325 lines (1293 loc) · 76.1 KB
/
publications.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE HTML>
<html>
<head>
<title>Corpling@GU - Publications</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<link rel="stylesheet" href="assets/css/main.css" />
<!-- Zoom is deliberately left enabled (no user-scalable=no) so readers can pinch-zoom the text. -->
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link rel="stylesheet" href="assets/css/amir_table.css" type="text/css" charset="utf-8"/>
<link rel="stylesheet" href="trublu.css" type="text/css" charset="utf-8"/>
<link rel="canonical" href="https://gucorpling.org/corpling/publications.html" />
<style type="text/css">
/* Page-local styles: quick-search form, settings strip, table/entry layout and print rules. */
/*body { background-color: white; font-family: Arial, sans-serif; font-size: 13px; line-height: 1.2; padding: 1em; color: #2E2E2E; margin: auto 2em; }
*/
/* The search form starts out with display:none; NOTE(review): presumably a script reveals it, confirm. */
form#quicksearch { width: auto; border-style: solid; border-color: gray; border-width: 1px 0px; padding: 0.7em 0.5em; display:none; position:relative; }
span#searchstat {padding-left: 1em;}
div#settings { margin-top:0.7em; /* border-bottom: 1px transparent solid; background-color: #efefef; border: 1px grey solid; */ }
div#settings ul {margin: 0; padding: 0; }
div#settings li {margin: 0; padding: 0 1em 0 0; display: inline; list-style: none; }
div#settings li + li { border-left: 2px #efefef solid; padding-left: 0.5em;}
div#settings input { margin-bottom: 0px;}
div#settings.hidden {display:none;}
#showsettings { border: 1px grey solid; padding: 0 0.5em; float:right; line-height: 1.6em; text-align: right; }
#showsettings:hover { cursor: pointer; }
.invalidsearch { background-color: red; }
input[type="button"] { background-color: #efefef; border: 1px #2E2E2E solid;}
table { width: 100%; empty-cells: show; border-spacing: 0em 0.2em; margin: 1em 0em; border-style: none; }
th, td { border: 1px gray solid; border-width: 1px 1px; padding: 0.5em; vertical-align: top; text-align: left; }
th { background-color: #efefef; }
td + td, th + th { border-left: none; }
td a { color: navy; text-decoration: none; }
td a:hover { text-decoration: underline; }
.noshow { display: none;}
.highlight div { background-color: #EFEFEF; border-top: 2px #2E2E2E solid; font-weight: bold; }
.abstract div, .review div, .bibtex div { background-color: #EFEFEF; text-align: justify; border-bottom: 2px #2E2E2E solid; }
.nextshow div { border-bottom: 1px gray solid; }
.bibtex pre { width: 100%; overflow: auto; white-space: pre-wrap;}
p.infolinks { margin: 0.3em 0em 0em 0em; padding: 0px; }
@media print {
/* Hide interactive controls and BibTeX rows on paper. The row selector must be
   tr.bibtex: a bare "t" type selector names no HTML element and never matches.
   NOTE(review): no element with id qs_settings is visible in this part of the file. */
p.infolinks, #qs_settings, #quicksearch, tr.bibtex { display: none !important; }
/* Keep each citation entry on a single printed page where possible. */
.entry { page-break-inside: avoid; }
}
</style>
</head>
<body id="publications">
<div id="wrapper">
<!-- Main -->
<div id="main">
<div class="inner" style="max-width: 1000px;">
<!-- Header: home link (logo) followed by a list of icon links to external lab resources. -->
<header id="header">
<a href="index.html" class="logo"><strong><i class="fa fa-home"> </i>Corpling@GU</strong></a>
<!-- Each icon link carries its text in a span.label; NOTE(review): presumably the stylesheet hides the label visually, confirm in main.css. -->
<ul class="icons">
<li><a href="https://wiki.gucorpling.org/" class="icon fa-wikipedia-w"><span class="label">Wiki</span></a></li>
<li><a href="https://corpus-tools.org/" class="icon fa icon-annis"><span class="label">Corpus-tools.org</span></a></li>
<li><a href="https://www.github.com/gucorpling/" class="icon fa-github"><span class="label">GitHub</span></a></li>
<!-- Disabled item below: a stacked search icon labelled CQP whose href is empty. -->
<!--<li><a href="" >
<span class="iconstack fa-stack">
<i class="iconstack fa fa-search fa-stack-2x"></i>
<strong class="fa-stack-1x calendar-text">CQP</strong>
</span></a></li>-->
</ul>
</header>
<section class="research">
<h1>Publications</h1>
<p>This page lists selected papers by lab members - see also a <a href="https://gucorpling.org/amir/publications.html">complete list</a> of Amir's publications.</p>
<!-- Quick-search bar. The handlers named in the on* attributes (clearQS, toggleSettings,
     updateSetting) are defined outside this block. The text field carries an aria-label
     because a placeholder alone does not give it an accessible name. -->
<form action="" id="quicksearch">
<input type="text" id="qs_field" aria-label="Search publications" autocomplete="off" placeholder="Type to search..." /> <input class="small" type="button" onclick="clearQS()" value="clear" />
<span id="searchstat">Matching entries: <span id="stat">0</span></span>
<div id="showsettings" onclick="toggleSettings()">settings...</div>
<div id="settings" class="hidden">
<ul>
<li><input type="checkbox" class="search_setting" id="opt_useRegExp" onchange="updateSetting(this)"><label for="opt_useRegExp"> use RegExp</label></li>
<li><input type="checkbox" class="search_setting" id="opt_noAccents" onchange="updateSetting(this)"><label for="opt_noAccents"> ignore accents</label></li>
</ul>
</div>
</form>
<div class="CSSTableGenerator">
<!-- Citation template for one publication entry.
     NOTE(review): the class names used here (bibtex_template, "if FIELD", "if !FIELD",
     bibtexVar with +FIELD+ in href and an extra attribute) look like bibtex-js template
     conventions; the script that consumes this markup is not visible here, so confirm.
     Read that way: an empty element whose class is a field name is filled with that
     field, and an "if FIELD" wrapper is kept only when the field is present.
     Layout: author (year), title (quoted for journal/booktitle entries, italic for
     school entries), then one of three venue branches (journal / booktitle / school),
     then link rows for url, url2 and doi.
     NOTE(review): in the journal branch the pages span is nested inside "if volume",
     so an article without a volume would show no pages; verify this is intended.
     This comment sits outside the template div on purpose so it is not part of the
     template markup itself. -->
<div class="bibtex_template">
<div class="entry">
<div class="autolist">
<div class="if author" style="margin-top:5px; padding-left: 20px; text-indent: -20px;">
<span class="author" style="font-weight: bold; "></span>
<span class="if year">
(<span class="year"></span>),
</span>
<span class="if journal">"</span><span class="if booktitle">"</span><span class="if school"><span class="title" style="font-style: italic"></span></span><span class="if !school"><span class="title"></span></span><span class="if journal">".</span><span class="if booktitle">". </span><span class="if school">. </span>
<span class="if journal" style="">
<span class="journal" style="font-style:italic"></span>
<span class="if volume">
<span class="volume"></span>
<span class="if number">
(<span class="number"></span>)</span><span class="if pages">, <span class="pages"></span>.
</span>
</span>
</span>
<span class="if booktitle" style="">In:
<span class="booktitle" style="font-style:italic"></span>.<span class="if series" style=""> (<span class="series" style=""></span>.)</span>
<span class="if publisher">
<span class="publisher"></span>:
</span>
<span class="if address">
<span class="address"></span></span><span class="if pages">, <span class="pages"></span></span>.
</span>
<span class="if school" style="">
<span class="if type"><span class="type"></span>,</span>
<span class="school" style=""></span>.
</span>
<span class="if url">
<br/>[<a class="bibtexVar" href="+url+" style="color:black; font-size:10px" extra="url">
<span class="if comment">
<span class="comment"></span>
</span>
<span class="if !comment">
<span>link</span>
</span>
</a>]
</span>
<span class="if url2">
[<a class="bibtexVar" href="+url2+" style="color:black; font-size:10px" extra="url2">
<span class="if comment2">
<span class="comment2"></span>
</span>
<span class="if !comment2">
<span>link</span>
</span>
</a>]
</span>
<span class="if doi">
<span class="if !url"><br/></span>[<a class="bibtexVar" href="https://doi.org/+doi+" style="color:black; font-size:10px" extra="doi">DOI</a>]
</span> </div>
</div>
</div>
</div>
<textarea id="bibtex_input" style="display:none;">
@inproceedings{liu-etal-2024-gdtb,
title = "{GDTB}: Genre Diverse Data for {E}nglish Shallow Discourse Parsing across Modalities, Text Types, and Domains",
author = "Liu, Yang Janet* and
Aoyama, Tatsuya* and
Scivetti, Wesley* and
Zhu, Yilun* and
Behzad, Shabnam and
Levine, Lauren and
Lin, Jessica and
Tiwari, Devika and
Zeldes, Amir",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.684",
note = "(*equal contribution)",
pages = "12287--12303",
abstract = "Work on shallow discourse parsing in English has focused on the Wall Street Journal corpus, the only large-scale dataset for the language in the PDTB framework. However, the data is not openly available, is restricted to the news domain, and is by now 35 years old. In this paper, we present and evaluate a new open-access, multi-genre benchmark for PDTB-style shallow discourse parsing, based on the existing UD English GUM corpus, for which discourse relation annotations in other frameworks already exist. In a series of experiments on cross-domain relation classification, we show that while our dataset is compatible with PDTB, substantial out-of-domain degradation is observed, which can be alleviated by joint training on both datasets.",
}
@inproceedings{behzad-etal-2024-ask,
title = "To Ask {LLM}s about {E}nglish Grammaticality, Prompt Them in a Different Language",
author = "Behzad, Shabnam and
Zeldes, Amir and
Schneider, Nathan",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.916",
pages = "15622--15634",
abstract = "In addition to asking questions about facts in the world, some internet users{---}in particular, second language learners{---}ask questions about language itself. Depending on their proficiency level and audience, they may pose these questions in an L1 (first language) or an L2 (second language). We investigate how multilingual LLMs perform at crosslingual metalinguistic question answering. Focusing on binary questions about sentence grammaticality constructed from error-annotated learner corpora, we prompt three LLMs (Aya, Llama, and GPT) in multiple languages, including English, German, Korean, Russian, and Ukrainian. Our study reveals that the language of the prompt can significantly affect model performance, and despite English being the dominant training language for all three models, prompting in a different language with questions about English often yields better results.",
}
@inproceedings{levine-zeldes-2024-unifying,
title = "Unifying the Scope of Bridging Anaphora Types in {E}nglish: Bridging Annotations in {ARRAU} and {GUM}",
author = "Levine, Lauren and
Zeldes, Amir",
editor = "Ogrodniczuk, Maciej and
Nedoluzhko, Anna and
Poesio, Massimo and
Pradhan, Sameer and
Ng, Vincent",
booktitle = "Proceedings of The Seventh Workshop on Computational Models of Reference, Anaphora and Coreference",
month = nov,
year = "2024",
address = "Miami",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.crac-1.5",
pages = "41--51",
}
@inproceedings{levine-etal-2024-lacuna,
title = "Lacuna Language Learning: Leveraging {RNN}s for Ranked Text Completion in Digitized {C}optic Manuscripts",
author = "Levine, Lauren and
Li, Cindy and
BremerMcCollum, Lydia and
Wagner, Nicholas and
Zeldes, Amir",
booktitle = "Proceedings of the 1st Workshop on Machine Learning for Ancient Languages (ML4AL 2024)",
month = aug,
year = "2024",
address = "Hybrid in Bangkok, Thailand and online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.ml4al-1.8",
doi = "10.18653/v1/2024.ml4al-1.8",
pages = "61--70",
abstract = "Ancient manuscripts are frequently damaged, containing gaps in the text known as lacunae. In this paper, we present a bidirectional RNN model for character prediction of Coptic characters in manuscript lacunae. Our best model performs with 72{\%} accuracy on single character reconstruction, but falls to 37{\%} when reconstructing lacunae of various lengths. While not suitable for definitive manuscript reconstruction, we argue that our RNN model can help scholars rank the likelihood of textual reconstructions. As evidence, we use our RNN model to rank reconstructions in two early Coptic manuscripts. Our investigation shows that neural models can augment traditional methods of textual restoration, providing scholars with an additional tool to assess lacunae in Coptic manuscripts.",
}
@article{10.1162/coli_a_00538,
author = {Zeldes, Amir and Aoyama, Tatsuya and Liu, Yang Janet and Peng, Siyao and Das, Debopam and Gessler, Luke},
title = "{eRST: A Signaled Graph Theory of Discourse Relations and Organization}",
journal = {Computational Linguistics},
pages = {1--47},
year = {2024},
month = {09},
abstract = "{In this article we present Enhanced Rhetorical Structure Theory (eRST), a new theoretical framework for computational discourse analysis, based on an expansion of Rhetorical Structure Theory (RST). The framework encompasses discourse relation graphs with tree-breaking, non-projective and concurrent relations, as well as implicit and explicit signals which give explainable rationales to our analyses. We survey shortcomings of RST and other existing frameworks, such as Segmented Discourse Representation Theory (SDRT), the Penn Discourse Treebank (PDTB) and Discourse Dependencies, and address these using constructs in the proposed theory. We provide annotation, search and visualization tools for data, and present and evaluate a freely available corpus of English annotated according to our framework, encompassing 12 spoken and written genres with over 200K tokens. Finally, we discuss automatic parsing, evaluation metrics and applications for data in our framework.}",
issn = {0891-2017},
url = {https://doi.org/10.1162/coli_a_00538},
eprint = {https://direct.mit.edu/coli/article-pdf/doi/10.1162/coli\_a\_00538/2470700/coli\_a\_00538.pdf},
}
@inproceedings{zhu-etal-2024-splice,
title = "{SPLICE}: A Singleton-Enhanced {P}ipe{LI}ne for Coreference {RE}solution",
author = "Zhu, Yilun and
Peng, Siyao and
Pradhan, Sameer and
Zeldes, Amir",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1321",
pages = "15191--15201",
abstract = "Singleton mentions, i.e. entities mentioned only once in a text, are important to how humans understand discourse from a theoretical perspective. However previous attempts to incorporate their detection in end-to-end neural coreference resolution for English have been hampered by the lack of singleton mention spans in the OntoNotes benchmark. This paper addresses this limitation by combining predicted mentions from existing nested NER systems and features derived from OntoNotes syntax trees. With this approach, we create a near approximation of the OntoNotes dataset with all singleton mentions, achieving {\textasciitilde}94{\%} recall on a sample of gold singletons. We then propose a two-step neural mention and coreference resolution system, named SPLICE, and compare its performance to the end-to-end approach in two scenarios: the OntoNotes test set and the out-of-domain (OOD) OntoGUM corpus. Results indicate that reconstructed singleton training yields results comparable to end-to-end systems for OntoNotes, while improving OOD stability (+1.1 avg. F1). We conduct error analysis for mention detection and delve into its impact on coreference clustering, revealing that precision improvements deliver more substantial benefits than increases in recall for resolving coreference chains.",
}
@inproceedings{braud-etal-2024-disrpt,
title = "{DISRPT}: A Multilingual, Multi-domain, Cross-framework Benchmark for Discourse Processing",
author = "Braud, Chlo{\'e} and
Zeldes, Amir and
Rivi{\`e}re, Laura and
Liu, Yang Janet and
Muller, Philippe and
Sileo, Damien and
Aoyama, Tatsuya",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.447",
pages = "4990--5005",
abstract = "This paper presents DISRPT, a multilingual, multi-domain, and cross-framework benchmark dataset for discourse processing, covering the tasks of discourse unit segmentation, connective identification, and relation classification. DISRPT includes 13 languages, with data from 24 corpora covering about 4 millions tokens and around 250,000 discourse relation instances from 4 discourse frameworks: RST, SDRT, PDTB, and Discourse Dependencies. We present an overview of the data, its development across three NLP shared tasks on discourse processing carried out in the past five years, and the latest modifications and added extensions. We also carry out an evaluation of state-of-the-art multilingual systems trained on the data for each task, showing plateau performance on segmentation, but important room for improvement for connective identification and relation classification. The DISRPT benchmark employs a unified format that we make available on GitHub and HuggingFace in order to encourage future work on discourse processing across languages, domains, and frameworks.",
}
@inproceedings{weissweiler-etal-2024-ucxn,
title = "{UC}xn: Typologically Informed Annotation of Constructions Atop {U}niversal {D}ependencies",
author = {Weissweiler, Leonie and
B{\"o}bel, Nina and
Guiller, Kirian and
Herrera, Santiago and
Scivetti, Wesley and
Lorenzi, Arthur and
Melnik, Nurit and
Bhatia, Archna and
Sch{\"u}tze, Hinrich and
Levin, Lori and
Zeldes, Amir and
Nivre, Joakim and
Croft, William and
Schneider, Nathan},
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1471",
pages = "16919--16932",
abstract = "The Universal Dependencies (UD) project has created an invaluable collection of treebanks with contributions in over 140 languages. However, the UD annotations do not tell the full story. Grammatical constructions that convey meaning through a particular combination of several morphosyntactic elements{---}for example, interrogative sentences with special markers and/or word orders{---}are not labeled holistically. We argue for (i) augmenting UD annotations with a {`}UCxn{'} annotation layer for such meaning-bearing grammatical constructions, and (ii) approaching this in a typologically informed way so that morphosyntactic strategies can be compared across languages. As a case study, we consider five construction families in ten languages, identifying instances of each construction in UD treebanks through the use of morphosyntactic patterns. In addition to findings regarding these particular constructions, our study yields important insights on methodology for describing and identifying constructions in language-general and language-particular ways, and lays the foundation for future constructional enrichment of UD treebanks.",
}
@inproceedings{poesio-etal-2024-universal,
title = "Universal Anaphora: The First Three Years",
author = "Poesio, Massimo and
Ogrodniczuk, Maciej and
Ng, Vincent and
Pradhan, Sameer and
Yu, Juntao and
Moosavi, Nafise Sadat and
Paun, Silviu and
Zeldes, Amir and
Nedoluzhko, Anna and
Nov{\'a}k, Michal and
Popel, Martin and
{\v{Z}}abokrtsk{\'y}, Zden{\v{e}}k and
Zeman, Daniel",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1484",
pages = "17087--17100",
abstract = "The aim of the Universal Anaphora initiative is to push forward the state of the art in anaphora and anaphora resolution by expanding the aspects of anaphoric interpretation which are or can be reliably annotated in anaphoric corpora, producing unified standards to annotate and encode these annotations, delivering datasets encoded according to these standards, and developing methods for evaluating models that carry out this type of interpretation. Although several papers on aspects of the initiative have appeared, no overall description of the initiative{'}s goals, proposals and achievements has been published yet except as an online draft. This paper aims to fill this gap, as well as to discuss its progress so far.",
}
@inproceedings{lin-zeldes-2024-gumsley,
title = "{GUM}sley: Evaluating Entity Salience in Summarization for 12 {E}nglish Genres",
author = "Lin, Jessica and
Zeldes, Amir",
editor = "Graham, Yvette and
Purver, Matthew",
booktitle = "Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = mar,
year = "2024",
address = "St. Julian{'}s, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.eacl-long.158",
pages = "2575--2588",
abstract = "As NLP models become increasingly capable of understanding documents in terms of coherent entities rather than strings, obtaining the most salient entities for each document is not only an important end task in itself but also vital for Information Retrieval (IR) and other downstream applications such as controllable summarization. In this paper, we present and evaluate GUMsley, the first entity salience dataset covering all named and non-named salient entities for 12 genres of English text, aligned with entity types, Wikification links and full coreference resolution annotations. We promote a strict definition of salience using human summaries and demonstrate high inter-annotator agreement for salience based on whether a source entity is mentioned in the summary. Our evaluation shows poor performance by pre-trained SOTA summarization models and zero-shot LLM prompting in capturing salient entities in generated summaries. We also show that predicting or providing salient entities to several model architectures enhances performance and helps derive higher-quality summaries by alleviating the entity hallucination problem in existing abstractive summarization.",
}
@inproceedings{zhu-etal-2023-incorporating,
title = "Incorporating Singletons and Mention-based Features in Coreference Resolution via Multi-task Learning for Better Generalization",
author = "Zhu, Yilun and
Peng, Siyao and
Pradhan, Sameer and
Zeldes, Amir",
editor = "Park, Jong C. and
Arase, Yuki and
Hu, Baotian and
Lu, Wei and
Wijaya, Derry and
Purwarianti, Ayu and
Krisnadhi, Adila Alfa",
booktitle = "Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = nov,
year = "2023",
address = "Nusa Dua, Bali",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.ijcnlp-short.14",
pages = "121--130",
}
@inproceedings{liu-etal-2023-whats,
title = "What{'}s Hard in {E}nglish {RST} Parsing? Predictive Models for Error Analysis",
author = "Liu, Yang Janet and
Aoyama, Tatsuya and
Zeldes, Amir",
editor = "Stoyanchev, Svetlana and
Joty, Shafiq and
Schlangen, David and
Dusek, Ondrej and
Kennington, Casey and
Alikhani, Malihe",
booktitle = "Proceedings of the 24th Meeting of the Special Interest Group on Discourse and Dialogue",
month = sep,
year = "2023",
address = "Prague, Czechia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.sigdial-1.3",
pages = "31--42",
abstract = "Despite recent advances in Natural Language Processing (NLP), hierarchical discourse parsing in the framework of Rhetorical Structure Theory remains challenging, and our understanding of the reasons for this are as yet limited. In this paper, we examine and model some of the factors associated with parsing difficulties in previous work: the existence of implicit discourse relations, challenges in identifying long-distance relations, out-of-vocabulary items, and more. In order to assess the relative importance of these variables, we also release two annotated English test-sets with explicit correct and distracting discourse markers associated with gold standard RST relations. Our results show that as in shallow discourse parsing, the explicit/implicit distinction plays a role, but that long-distance dependencies are the main challenge, while lack of lexical overlap is less of a problem, at least for in-domain parsing. Our final model is able to predict where errors will occur with an accuracy of 76.3{\%} for the bottom-up parser and 76.6{\%} for the top-down parser.",
}
@inproceedings{behzad-etal-2023-sentence,
title = "Sentence-level Feedback Generation for {E}nglish Language Learners: Does Data Augmentation Help?",
author = "Behzad, Shabnam and
Zeldes, Amir and
Schneider, Nathan",
editor = "Mille, Simon",
booktitle = "Proceedings of the 16th International Natural Language Generation Conference: Generation Challenges",
month = sep,
year = "2023",
address = "Prague, Czechia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.inlg-genchal.8",
pages = "53--59",
abstract = "In this paper, we present strong baselines for the task of Feedback Comment Generation for Writing Learning. Given a sentence and an error span, the task is to generate a feedback comment explaining the error. Sentences and feedback comments are both in English. We experiment with LLMs and also create multiple pseudo datasets for the task, investigating how it affects the performance of our system. We present our results for the task along with extensive analysis of the generated comments with the aim of aiding future studies in feedback comment generation for English language learners.",
}
@inproceedings{braud-etal-2023-disrpt,
title = "The {DISRPT} 2023 Shared Task on Elementary Discourse Unit Segmentation, Connective Detection, and Relation Classification",
author = "Braud, Chlo{\'e} and
Liu, Yang Janet and
Metheniti, Eleni and
Muller, Philippe and
Rivi{\`e}re, Laura and
Rutherford, Attapol and
Zeldes, Amir",
booktitle = "Proceedings of the 3rd Shared Task on Discourse Relation Parsing and Treebanking (DISRPT 2023)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "The Association for Computational Linguistics",
url = "https://aclanthology.org/2023.disrpt-1.1",
doi = "10.18653/v1/2023.disrpt-1.1",
pages = "1--21",
abstract = "In 2023, the third iteration of the DISRPT Shared Task (Discourse Relation Parsing and Treebanking) was held, dedicated to the underlying units used in discourse parsing across formalisms. Following the success of the 2019 and 2021 tasks on Elementary Discourse Unit Segmentation, Connective Detection, and Relation Classification, this iteration has added 10 new corpora, including 2 new languages (Thai and Italian) and 3 discourse treebanks annotated in the discourse dependency representation in addition to the previously included frameworks: RST, SDRT, and PDTB. In this paper, we review the data included in the Shared Task, which covers 26 datasets across 13 languages, survey and compare submitted systems, and report on system performance on each task for both annotated and plain-tokenized versions of the data.",
}
@inproceedings{aoyama-etal-2023-gentle,
title = "{GENTLE}: A Genre-Diverse Multilayer Challenge Set for {E}nglish {NLP} and Linguistic Evaluation",
author = "Aoyama, Tatsuya and
Behzad, Shabnam and
Gessler, Luke and
Levine, Lauren and
Lin, Jessica and
Liu, Yang Janet and
Peng, Siyao and
Zhu, Yilun and
Zeldes, Amir",
booktitle = "Proceedings of the 17th Linguistic Annotation Workshop (LAW-XVII)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.law-1.17",
doi = "10.18653/v1/2023.law-1.17",
pages = "166--178",
abstract = "We present GENTLE, a new mixed-genre English challenge corpus totaling 17K tokens and consisting of 8 unusual text types for out-of-domain evaluation: dictionary entries, esports commentaries, legal documents, medical notes, poetry, mathematical proofs, syllabuses, and threat letters. GENTLE is manually annotated for a variety of popular NLP tasks, including syntactic dependency parsing, entity recognition, coreference resolution, and discourse parsing. We evaluate state-of-the-art NLP systems on GENTLE and find severe degradation for at least some genres in their performance on all tasks, which indicates GENTLE{'}s utility as an evaluation dataset for NLP systems.",
}
@inproceedings{behzad-etal-2023-elqa,
title = "{ELQA}: A Corpus of Metalinguistic Questions and Answers about {E}nglish",
author = "Behzad, Shabnam and
Sakaguchi, Keisuke and
Schneider, Nathan and
Zeldes, Amir",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-long.113",
doi = "10.18653/v1/2023.acl-long.113",
pages = "2031--2047",
abstract = "We present ELQA, a corpus of questions and answers in and about the English language. Collected from two online forums, the {\textgreater}70k questions (from English learners and others) cover wide-ranging topics including grammar, meaning, fluency, and etymology. The answers include descriptions of general properties of English vocabulary and grammar as well as explanations about specific (correct and incorrect) usage examples. Unlike most NLP datasets, this corpus is metalinguistic{---}it consists of language about language. As such, it can facilitate investigations of the metalinguistic capabilities of NLU models, as well as educational applications in the language learning domain. To study this, we define a free-form question answering task on our dataset and conduct evaluations on multiple LLMs (Large Language Models) to analyze their capacity to generate metalinguistic answers.",
}
@inproceedings{liu-zeldes-2023-gumsum,
title = "{GUMS}um: Multi-Genre Data and Evaluation for {E}nglish Abstractive Summarization",
author = "Liu, Yang Janet and
Zeldes, Amir",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.593",
doi = "10.18653/v1/2023.findings-acl.593",
pages = "9315--9327",
abstract = "Automatic summarization with pre-trained language models has led to impressively fluent results, but is prone to {`}hallucinations{'}, low performance on non-news genres, and outputs which are not exactly summaries. Targeting ACL 2023{'}s {`}Reality Check{'} theme, we present GUMSum, a small but carefully crafted dataset of English summaries in 12 written and spoken genres for evaluation of abstractive summarization. Summaries are highly constrained, focusing on substitutive potential, factuality, and faithfulness. We present guidelines and evaluate human agreement as well as subjective judgments on recent system outputs, comparing general-domain untuned approaches, a fine-tuned one, and a prompt-based approach, to human performance. Results show that while GPT3 achieves impressive scores, it still underperforms humans, with varying quality across genres. Human judgments reveal different types of errors in supervised, prompted, and human-generated summaries, shedding light on the challenges of producing a good summary.",
}
@inproceedings{liu-zeldes-2023-cant,
title = "Why Can{'}t Discourse Parsing Generalize? A Thorough Investigation of the Impact of Data Diversity",
author = "Liu, Yang Janet and
Zeldes, Amir",
booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.eacl-main.227",
pages = "3112--3130",
abstract = "Recent advances in discourse parsing performance create the impression that, as in other NLP tasks, performance for high-resource languages such as English is finally becoming reliable. In this paper we demonstrate that this is not the case, and thoroughly investigate the impact of data diversity on RST parsing stability. We show that state-of-the-art architectures trained on the standard English newswire benchmark do not generalize well, even within the news domain. Using the two largest RST corpora of English with text from multiple genres, we quantify the impact of genre diversity in training data for achieving generalization to text types unseen during training. Our results show that a heterogeneous training regime is critical for stable and generalizable models, across parser architectures. We also provide error analyses of model outputs and out-of-domain performance. To our knowledge, this study is the first to fully evaluate cross-corpus RST parsing generalizability on complete trees, examine between-genre degradation within an RST corpus, and investigate the impact of genre diversity in training data composition.",
}
@inproceedings{lin-2022-leveraging,
title = "Leveraging World Knowledge in Implicit Hate Speech Detection",
author = "Lin, Jessica",
editor = "Biester, Laura and
Demszky, Dorottya and
Jin, Zhijing and
Sachan, Mrinmaya and
Tetreault, Joel and
Wilson, Steven and
Xiao, Lu and
Zhao, Jieyu",
booktitle = "Proceedings of the Second Workshop on NLP for Positive Impact (NLP4PI)",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.nlp4pi-1.4",
doi = "10.18653/v1/2022.nlp4pi-1.4",
pages = "31--39",
abstract = "While much attention has been paid to identifying explicit hate speech, implicit hateful expressions that are disguised in coded or indirect language are pervasive and remain a major challenge for existing hate speech detection systems. This paper presents the first attempt to apply Entity Linking (EL) techniques to both explicit and implicit hate speech detection, where we show that such real world knowledge about entity mentions in a text does help models better detect hate speech, and the benefit of adding it into the model is more pronounced when explicit entity triggers (e.g., rally, KKK) are present. We also discuss cases where real world knowledge does not add value to hate speech detection, which provides more insights into understanding and modeling the subtleties of hate speech.",
}
@inproceedings{zeldes-etal-2022-second,
title = "A Second Wave of {UD} {H}ebrew Treebanking and Cross-Domain Parsing",
author = "Zeldes, Amir and
Howell, Nick and
Ordan, Noam and
Ben Moshe, Yifat",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.292",
pages = "4331--4344",
abstract = "Foundational Hebrew NLP tasks such as segmentation, tagging and parsing, have relied to date on various versions of the Hebrew Treebank (HTB, Sima{'}an et al. 2001). However, the data in HTB, a single-source newswire corpus, is now over 30 years old, and does not cover many aspects of contemporary Hebrew on the web. This paper presents a new, freely available UD treebank of Hebrew stratified from a range of topics selected from Hebrew Wikipedia. In addition to introducing the corpus and evaluating the quality of its annotations, we deploy automatic validation tools based on grew (Guillaume, 2021), and conduct the first cross domain parsing experiments in Hebrew. We obtain new state-of-the-art (SOTA) results on UD NLP tasks, using a combination of the latest language modelling and some incremental improvements to existing transformer based approaches. We also release a new version of the UD HTB matching annotation scheme updates from our new corpus.",
}
@inproceedings{peng-etal-2022-gcdt,
title = "{GCDT}: A {C}hinese {RST} Treebank for Multigenre and Multilingual Discourse Parsing",
author = "Peng, Siyao and
Liu, Yang Janet and
Zeldes, Amir",
booktitle = "Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)",
month = nov,
year = "2022",
address = "Online only",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.aacl-short.47",
pages = "382--391",
abstract = "A lack of large-scale human-annotated data has hampered the hierarchical discourse parsing of Chinese. In this paper, we present GCDT, the largest hierarchical discourse treebank for Mandarin Chinese in the framework of Rhetorical Structure Theory (RST). GCDT covers over 60K tokens across five genres of freely available text, using the same relation inventory as contemporary RST treebanks for English. We also report on this dataset{'}s parsing experiments, including state-of-the-art (SOTA) scores for Chinese RST parsing and RST parsing on the English GUM dataset, using cross-lingual training in Chinese and English with multilingual embeddings.",
}
@inproceedings{zabokrtsky-etal-2022-findings,
title = "Findings of the Shared Task on Multilingual Coreference Resolution",
author = "\v{Z}abokrtsk\'{y}, Zden\v{e}k and
Konop\'{i}k, Miloslav and
Nedoluzhko, Anna and
Nov\'{a}k, Michal and
Ogrodniczuk, Maciej and
Popel, Martin and
Pra\v{z}\'{a}k, Ond\v{r}ej and
Sido, Jakub and
Zeman, Daniel and
Zhu, Yilun",
booktitle = "Proceedings of the CRAC 2022 Shared Task on Multilingual Coreference Resolution",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.crac-mcr.1",
pages = "1--17",
abstract = "This paper presents an overview of the shared task on multilingual coreference resolution associated with the CRAC 2022 workshop. Shared task participants were supposed to develop trainable systems capable of identifying mentions and clustering them according to identity coreference. The public edition of CorefUD 1.0, which contains 13 datasets for 10 languages, was used as the source of training and evaluation data. The CoNLL score used in previous coreference-oriented shared tasks was used as the main evaluation metric. There were 8 coreference prediction systems submitted by 5 participating teams; in addition, there was a competitive Transformer-based baseline system provided by the organizers at the beginning of the shared task. The winner system outperformed the baseline by 12 percentage points (in terms of the CoNLL scores averaged across all datasets for individual languages).",
}
@inproceedings{levine-2022-sharing,
title = "Sharing Data by Language Family: Data Augmentation for {R}omance Language Morpheme Segmentation",
author = "Levine, Lauren",
booktitle = "Proceedings of the 19th SIGMORPHON Workshop on Computational Research in Phonetics, Phonology, and Morphology",
month = jul,
year = "2022",
address = "Seattle, Washington",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.sigmorphon-1.12",
pages = "117--123",
abstract = "This paper presents a basic character level sequence-to-sequence approach to morpheme segmentation for the following Romance languages: French, Italian, and Spanish. We experiment with adding a small set of additional linguistic features, as well as with sharing training data between sister languages for morphological categories with low performance in single language base models. We find that while the additional linguistic features were generally not helpful in this instance, data augmentation between sister languages did help to raise the scores of some individual morphological categories, but did not consistently result in an overall improvement when considering the aggregate of the categories.",
}
@inproceedings{nedoluzhko-et-al-2022-corefud,
address = {Marseille, France},
title = {{CorefUD} 1.0: Coreference Meets {U}niversal {D}ependencies},
url = {https://aclanthology.org/2022.lrec-1.520/},
abstract = {Recent advances in standardization for annotated language resources have led to successful large scale efforts, such as the Universal Dependencies (UD) project for multilingual syntactically annotated data. By comparison, the important task of coreference resolution, which clusters multiple mentions of entities in a text, has yet to be standardized in terms of data formats or annotation guidelines. In this paper we present CorefUD, a multilingual collection of corpora and a standardized format for coreference resolution, compatible with morphosyntactic annotations in the UD framework and including facilities for related tasks such as named entity recognition, which forms a first step in the direction of convergence for coreference resolution across languages.},
booktitle = {Proceedings of {LREC2022}},
publisher = {European Language Resources Association},
author = {Nedoluzhko, Anna and Nov\'{a}k, Michal and Popel, Martin and \v{Z}abokrtsk\'{y}, Zden\v{e}k and Zeldes, Amir and Zeman, Daniel },
month = jun,
year = {2022},
pages = {4859--4872},
}
@inproceedings{gessler_midas_2022,
address = {Marseille, France},
title = {Midas {Loop}: {A} {Prioritized} {Human}-in-the-{Loop} {Annotation} for {Large} {Scale} {Multilayer} {Data}},
url = {https://aclanthology.org/2022.lawxvi-1.13},
abstract = {Large scale annotation of rich multilayer corpus data is expensive and time consuming, motivating approaches that integrate high quality automatic tools with active learning in order to prioritize human labeling of hard cases. A related challenge in such scenarios is the concurrent management of automatically annotated data and human annotated data, particularly where different subsets of the data have been corrected for different types of annotation and with different levels of confidence. In this paper we present Midas Loop, a collaborative, version-controlled online annotation environment for multilayer corpus data which includes integrated provenance and confidence metadata for each piece of information at the document, sentence, token and annotation level. We present a case study on improving annotation quality in an existing multilayer parse bank of English called AMALGUM, focusing on active learning in corpus preprocessing, at the surprisingly challenging level of sentence segmentation. Our results show improvements to state-of-the-art sentence segmentation and a promising workflow for getting "silver" data to approach gold standard quality.},
booktitle = {Proceedings of {The} 16th {Linguistic} {Annotation} {Workshop} ({LAW}-{XVI}) within {LREC2022}},
publisher = {European Language Resources Association},
author = {Gessler, Luke and Levine, Lauren and Zeldes, Amir},
month = jun,
year = {2022},
pages = {103--110},
}
@inproceedings{gessler-zeldes-2022-microbert,
title = "{M}icro{BERT}: Effective Training of Low-resource Monolingual {BERT}s through Parameter Reduction and Multitask Learning",
author = "Gessler, Luke and
Zeldes, Amir",
editor = {Ataman, Duygu and
Gonen, Hila and
Ruder, Sebastian and
Firat, Orhan and
G{\"u}l Sahin, G{\"o}zde and
Mirzakhalov, Jamshidbek},
booktitle = "Proceedings of the 2nd Workshop on Multi-lingual Representation Learning (MRL)",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.mrl-1.9",
doi = "10.18653/v1/2022.mrl-1.9",
pages = "86--99",
abstract = "BERT-style contextualized word embedding models are critical for good performance in most NLP tasks, but they are data-hungry and therefore difficult to train for low-resource languages. In this work, we investigate whether a combination of greatly reduced model size and two linguistically rich auxiliary pretraining tasks (part-of-speech tagging and dependency parsing) can help produce better BERTs in a low-resource setting. Results from 7 diverse languages indicate that our model, MicroBERT, is able to produce marked improvements in downstream task evaluations, including gains up to 18{\%} for parser LAS and 11{\%} for NER F1 compared to an mBERT baseline, and we achieve these results with less than 1{\%} of the parameter count of a multilingual BERT base{--}sized model. We conclude that training very small BERTs and leveraging any available labeled data for multitask learning during pretraining can produce models which outperform both their multilingual counterparts and traditional fixed embeddings for low-resource languages.",
}
@inproceedings{lin-zeldes-2021-wikigum,
title = {{W}iki{GUM}: Exhaustive Entity Linking for Wikification in 12 Genres},
author = {Jessica Lin and Amir Zeldes},
booktitle = {Proceedings of The Joint 15th Linguistic Annotation Workshop (LAW) and 3rd Designing Meaning Representations (DMR) Workshop (LAW-DMR 2021)},
month = nov,
year = {2021},
address = {Punta Cana, Dominican Republic},
url = {https://aclanthology.org/2021.law-1.18},
pages = {170--175},
comment = {paper}
}
@InProceedings{ZeldesEtAl2021,
author = {Amir Zeldes and Yang Janet Liu and Mikel Iruskieta and Philippe Muller and Chlo\'{e} Braud and Sonia Badene},
booktitle = "Proceedings of the 2nd Shared Task on Discourse Relation Parsing and Treebanking (DISRPT 2021)",
title = {The {DISRPT} 2021 Shared Task on Elementary Discourse Unit Segmentation, Connective Detection, and Relation Classification},
year = {2021},
address = {Punta Cana, Dominican Republic},
pages = {1--12},
url = {https://aclanthology.org/2021.disrpt-1.1},
comment = {paper}
}
@inproceedings{gessler-etal-2021-discodisco,
title = "{D}is{C}o{D}is{C}o at the {DISRPT}2021 Shared Task: A System for Discourse Segmentation, Classification, and Connective Detection",
author = "Gessler, Luke and
Behzad, Shabnam and
Liu, Yang Janet and
Peng, Siyao and
Zhu, Yilun and
Zeldes, Amir",
booktitle = "Proceedings of the 2nd Shared Task on Discourse Relation Parsing and Treebanking (DISRPT 2021)",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.disrpt-1.6",
pages = "51--62",
comment = {paper}
}
@inproceedings{zhu-etal-2021-anatomy,
title = {Anatomy of {O}nto{GUM}{---}{A}dapting {GUM} to the {O}nto{N}otes Scheme to Evaluate Robustness of {SOTA} Coreference Algorithms},
author = {Yilun Zhu and Sameer Pradhan and Amir Zeldes},
booktitle = {Proceedings of the Fourth Workshop on Computational Models of Reference, Anaphora and Coreference (CRAC 2021)},
month = nov,
year = {2021},
address = {Punta Cana, Dominican Republic},
url = {https://aclanthology.org/2021.crac-1.15},
pages = {141--149},
comment = {paper}
}
@InProceedings{ZhuEtAl2021,
author = {Yilun Zhu and Sameer Pradhan and Amir Zeldes},
booktitle = {Proceedings of ACL-IJCNLP 2021},
title = {{OntoGUM}: Evaluating Contextualized {SOTA} Coreference Resolution on 12 More Genres},
year = {2021},
address = {Bangkok, Thailand},
pages = {461--467},
url = {https://aclanthology.org/2021.acl-short.59.pdf},
comment = {paper}
}
@InProceedings{ManningEtAl2021,
author = {Emma Manning and Nathan Schneider and Amir Zeldes},
booktitle = {Fifth Workshop on Teaching NLP at NAACL 2021},
title = {A Balanced and Broadly Targeted Computational Linguistics Curriculum},
year = {2021},
address = {Mexico City, Mexico},
pages = {65--69},
url = {https://www.aclweb.org/anthology/2021.teachingnlp-1.11/},
comment = {paper}
}
@InProceedings{GesslerEtAl2021,
author = {Luke Gessler and Siyao Peng and Yang Liu and Yilun Zhu and Shabnam Behzad and Amir Zeldes},
booktitle = {Proceedings of the Society for Computation in Linguistics (SCiL), Vol. 4},
title = {Overview of {AMALGUM} -- Large Silver Quality Annotations across {E}nglish Genres},
year = {2021},
address = {online},
pages = {434--437},
url = {https://scholarworks.umass.edu/scil/vol4/iss1/53/},
comment = {paper}
}
@InProceedings{ZeldesMartinTu2020,
author = {Amir Zeldes and Lance Martin and Sichang Tu},
booktitle = {Proceedings of the SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2020)},
title = {Exhaustive Entity Recognition for {C}optic: Challenges And Solutions},
year = {2020},
address = {Barcelona, Spain},
pages = {19--28},
url = {https://www.aclweb.org/anthology/2020.latechclfl-1.3.pdf},
comment = {paper}
}
@Article{ZeldesLiu2020,
author = {Amir Zeldes and Yang Liu},
journal = {Dialogue and Discourse},
title = {A Neural Approach to Discourse Relation Signal Detection},
year = {2020},
number = {2},
pages = {1--33},
volume = {11},
comment = {paper},
url = {https://journals.uic.edu/ojs/index.php/dad/article/view/11372/9733},
}
@InProceedings{GesslerEtAl2020,
author = {Luke Gessler and Siyao Peng and Yang Liu and Yilun Zhu and Shabnam Behzad and Amir Zeldes},
booktitle = {Proceedings of LREC 2020},
title = {{AMALGUM} - A Free, Balanced, Multilayer {E}nglish Web Corpus},
year = {2020},
address = {Marseille, France},
pages = {5267--5275},
comment = {paper},
url = {https://www.aclweb.org/anthology/2020.lrec-1.648.pdf},
}
@Article{SchroederZeldes2020,
author = {Caroline T. Schroeder and Amir Zeldes},
journal = {Journal of Data Mining and Digital Humanities. Special Issue on Collecting, Preserving, and Disseminating Endangered Cultural Heritage for New Understandings through Multilingual Approaches},
title = {A Collaborative Ecosystem for Digital {C}optic Studies},
year = {2020},
pages = {1--9},
comment = {paper},
url = {https://jdmdh.episciences.org/6797/pdf},
}
@InProceedings{SanguinettiBoscoCassidyEtAl2020,
author = {Manuela Sanguinetti and Cristina Bosco and Lauren Cassidy and \"{O}zlem Çetinoğlu and Alessandra Teresa Cignarella and Teresa Lynn and Ines Rehbein and Josef Ruppenhofer and Djam\'{e} Seddah and Amir Zeldes},
booktitle = {Proceedings of LREC 2020},
title = {Treebanking User-Generated Content: A Proposal for a Unified Representation in {U}niversal {D}ependencies},
year = {2020},
address = {Marseille, France},
pages = {5240--5250},
comment = {paper},
url = {https://www.aclweb.org/anthology/2020.lrec-1.645.pdf},
}
@inproceedings{behzad-zeldes-2020-cross,
title = "A Cross-Genre Ensemble Approach to Robust {R}eddit Part of Speech Tagging",
author = "Shabnam Behzad and
Amir Zeldes",
booktitle = "Proceedings of the 12th Web as Corpus Workshop",
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://www.aclweb.org/anthology/2020.wac-1.7",
pages = "50--56",
language = "English",
ISBN = "979-10-95546-68-9",
comment = {paper}
}
@InProceedings{YuZhuLiuEtAl2019,
author = {Yue Yu and Yilun Zhu and Yang Liu and Yan Liu and Siyao Peng and Mackenzie Gong and Amir Zeldes},
title = {{G}um{D}rop at the {DISRPT}2019 Shared Task: A Model Stacking Approach to Discourse Unit Segmentation and Connective Detection},
booktitle = {Proceedings of Discourse Relation Treebanking and Parsing (DISRPT 2019)},
year = {2019},
pages = {133--143},
address = {Minneapolis, MN},
url = {https://www.aclweb.org/anthology/W19-2717},
comment = {paper}
}
@InProceedings{ZeldesDasMazieroEtAl2019,
author = {Amir Zeldes and Debopam Das and Erick Galani Maziero and Juliano Desiderato Antonio and Mikel Iruskieta},
title = {{The DISRPT 2019 Shared Task} on Elementary Discourse Unit Segmentation and Connective Detection},
booktitle = {Proceedings of Discourse Relation Treebanking and Parsing (DISRPT 2019)},
year = {2019},
address = {Minneapolis, MN},
pages = {97--104},
url = {https://www.aclweb.org/anthology/W19-2713},
comment = {paper}
}
@InProceedings{GesslerLiuZeldes2019Signals,
author = {Luke Gessler and Yang Liu and Amir Zeldes},
title = {A Discourse Signal Annotation System for {RST} Trees},
year = {2019},
booktitle = {Proceedings of the Workshop on Discourse Relation Parsing and Treebanking 2019},
address = {Minneapolis, MN},
url = {https://aclweb.org/anthology/papers/W/W19/W19-2708/},
pages = {56--61},
comment = {paper}
}
@article{liu2019discourse,
title={Discourse Relations and Signaling Information: Anchoring Discourse Signals in {RST-DT}},
author={Yang Liu and Amir Zeldes},
journal={Proceedings of the Society for Computation in Linguistics},
volume={2},
number={1},
pages={314--317},
year={2019}
}
@InProceedings{Gessler2019computel,
title={{Developing without developers: choosing labor-saving tools for language documentation apps}},
author={Luke Gessler},
booktitle={{Proceedings of the 3rd Workshop on the Use of Computational Methods in the Study of Endangered Languages}},
volume={1},
pages={6--13},
year={2019},
url={https://computel-workshop.org/wp-content/uploads/2019/02/CEL3_book_papers_draft.pdf#page=18},
comment={paper}
}
@InProceedings{ZeldesAbramsUDW2018,
author = {Amir Zeldes and Mitchell Abrams},
title = {The {C}optic Universal Dependency Treebank},
year = {2018},
booktitle = {Proceedings of the Universal Dependencies Workshop 2018 (UDW 2018)},
address = {Brussels, Belgium},
url = {http://aclweb.org/anthology/W18-6022},
pages = {192--201},
comment = {paper}
}
@InProceedings{Zeldes2018SIGMORPHON,
author = {Amir Zeldes},
title = {A Characterwise Windowed Approach to {H}ebrew Morphological Segmentation},
booktitle = {Proceedings of the 15th {SIGMORPHON} Workshop on Computational Research in Phonetics, Phonology, and Morphology},
year = {2018},
address = {Brussels, Belgium},
pages = {101--110},
url = {http://aclweb.org/anthology/W18-5811},
comment = {paper}
}
@InProceedings{Zeldes2018AACL,
author = {Amir Zeldes},
title = {A Multi-Dimensional Analysis of {RST} Discourse Relations in Eight Genres},
booktitle = {14th American Association of Corpus Linguistics Conference (AACL 2018)},
year = {2018},
address = {Atlanta, GA},
url = {https://gucorpling.org/amir/pdf/AACL_2018_MD_RST.pdf},
comment = {abstract}
}
@InProceedings{PengZeldes2018AACL,
author = {Siyao Peng and Amir Zeldes},
title = {Validating and Merging a Growing Multilayer Corpus – the Case of {GUM}},
booktitle = {14th American Association of Corpus Linguistics Conference (AACL 2018)},
year = {2018},
address = {Atlanta, GA},
url = {https://gucorpling.org/amir/pdf/AACL2018_gum_abstract.pdf},
comment = {abstract}
}
@InProceedings{FederKupreyevManningEtAl2018,
author = {Frank Feder and Maxim Kupreyev and Emma Manning and Caroline T. Schroeder and Amir Zeldes},
title = {A Linked {C}optic Dictionary Online},
booktitle = {Proceedings of LaTeCH 2018 - The 11th SIGHUM Workshop at COLING2018},
year = {2018},
address = {Santa Fe, NM},
pages = {12--21},
url = {http://aclweb.org/anthology/W18-4502},
comment = {paper}
}
@InProceedings{PengZeldes2018COLING,
author = {Siyao Peng and Amir Zeldes},
title = {All Roads Lead to {UD}: Converting Stanford and Penn Parses to {E}nglish Universal Dependencies with Multilayer Annotations},
booktitle = {{COLING} 2018 Joint Workshop on Linguistic Annotation, Multiword Expressions and Constructions ({LAW-MWE-CxG-2018})},
year = {2018},
address = {Santa Fe, NM},
pages = {167--177},
url = {http://aclweb.org/anthology/W18-4918},
comment = {paper}
}
@InCollection{Zeldes2018SC,
author = {Amir Zeldes},
title = {Compounds and Productivity in Advanced {L2} {G}erman Writing: A Constructional Approach},
booktitle = {Usage-inspired L2 Instruction: Researched Pedagogy},
year = {2018},
editor = {Lourdes Ortega and Andrea Tyler and Mariko Uno and Hae In Park},
note = {3},
publisher = {John Benjamins},
pages = {237--265},
url = {https://gucorpling.org/amir/pdf/Compounds_and_Productivity_Zeldes_revised.pdf},
comment = {prepub version},
doi = {10.1075/lllt.49.11zel},
address = {Amsterdam}
}
@InProceedings{Zeldes2018CRAC,
author = {Amir Zeldes},
title = {A Predictive Model for Notional Anaphora in {E}nglish},
booktitle = {{NAACL} 2018 Workshop on Computational Models of Reference, Anaphora, and Coreference ({CRAC})},
year = {2018},
address = {New Orleans, LA},
pages = {34--43},
url = {http://aclweb.org/anthology/W18-0704},
comment = {paper}
}
@InProceedings{MacAvaneyZeldes2018,
author = {Sean MacAvaney and Amir Zeldes},
title = {A Deeper Look into Dependency-Based Word Embeddings},
booktitle = {NAACL 2018 Student Research Workshop},
year = {2018},
pages = {40--45},
url = {http://aclweb.org/anthology/N18-4006},
address = {New Orleans, LA}
}
@article{Simpson_etal,
author={Sean Simpson and Nikki Adams and Claudia M. Brugman and Thomas J. Conners},
title={Detecting Novel and Emerging Drug Terms Using Natural Language Processing: A Social Media Corpus Study},
journal={Journal of Medical Internet Research: Public Health and Surveillance},
year={2018},
volume={4},
number={1},
pages={e2},
url={http://publichealth.jmir.org/2018/1/e2/}
}
@InCollection{Simpson_2017,
author = {Anna Belew and Sean Simpson},
title = {{Language extinction then and now}},
booktitle = {{Cataloguing the Endangered Languages of the World}},
year = {2018},
editor = {Lyle Campbell and Anna Belew},
publisher = {Taylor and Francis},
address={Abingdon}
}
@InCollection{Simpson_2017a,
author = {Raina Heaton and Sean Simpson},
title = {{How ELCat Serves Communities whose Languages are at Risk}},
booktitle = {{Cataloguing the Endangered Languages of the World}},
year = {2018},
editor = {Lyle Campbell and John Van Way and Anna Belew},
publisher = {Taylor and Francis},
address={Abingdon}
}
@InCollection{Belew_2017,
author = {Anna Belew and Sean Simpson},
title = {{The Status of the World's Endangered Languages}},
booktitle = {{The Oxford Handbook of Endangered Languages}},
year = {2018},
editor = {Lyle Campbell and Kenneth L. Rehg},
publisher = {Oxford University Press},
address={Oxford}
}
@InProceedings{Zeldes2018GURT,
author = {Amir Zeldes},
title = {A Neural Approach to Discourse Relation Signaling},
booktitle = {Georgetown University Round Table (GURT) 2018: Approaches to Discourse},
year = {2018},
address = {Washington, DC},
url = {https://gucorpling.org/amir/pdf/neural_DR_abstract_GURT2018.pdf},
comment = {abstract},
url2 = {https://gucorpling.org/amir/pdf/RST_GURT2018_final.pdf},
comment2 = {slides}
}
@InProceedings{Simpson2017,
author = {Sean Simpson},
title = {{Reconstructing Thread Structure For Reddit Comments}},
booktitle = {{Mid-Atlantic Student Colloquium on Speech, Language and Learning (MASC-SLL)}},
address = {{Washington, DC}},
year = {2017}
}
@PhdThesis{Zhang2017,
type = {PhD Thesis},
title = {Mining Linguistic Tone Patterns Using Fundamental Frequency Time-Series Data},
author = {Shuo Zhang},
school = {Georgetown University},
url = {https://repository.library.georgetown.edu/handle/10822/1047816},
year = {2017}
}
@InProceedings{Zeldes2017a,
author = {Amir Zeldes},
title = {A Distributional View of Discourse Encapsulation: Multifactorial Prediction of Coreference Density in {RST}},
booktitle = {6th Workshop on Recent Advances in RST and Related Formalisms at INLG},
year = {2017},
pages = {20--28},
address = {Santiago de Compostela, Spain},
url = {http://aclweb.org/anthology/W17-3603},
comment = {paper}
}
@InProceedings{ZhangZeldes2017,
author = {Shuo Zhang and Amir Zeldes},
title = {{GitDOX}: A Linked Version Controlled Online {XML} Editor for Manuscript Transcription},
booktitle = {Proceedings of FLAIRS-30},
year = {2017},
pages = {619--623},
address = {Marco Island, FL},
url = {https://gucorpling.org/amir/pdf/GitDOX_A_Linked_Version_Controlled_Online_XML_Editor_prepub.pdf},