\n", + " | id | \n", + "title | \n", + "link | \n", + "language | \n", + "captures_total | \n", + "urls_uniq_estimate | \n", + "warc_size_in_bytes | \n", + "content_languages | \n", + "content_types | \n", + "captures_per_year | \n", + "captures_per_crawl | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "162 | \n", + "aeh2 | \n", + "http://www.aeh2.org | \n", + "es | \n", + "3039 | \n", + "1233 | \n", + "80847151 | \n", + "{\"eng\":7,\"eng,spa\":307,\"spa\":767,\"spa,cat\":1,\"... | \n", + "{\"application/pdf\":19,\"application/xhtml+xml\":... | \n", + "{\"2020\":1762,\"2021\":1277} | \n", + "{\"CC-MAIN-2020-05\":195,\"CC-MAIN-2020-10\":128,\"... | \n", + "
1 | \n", + "198 | \n", + "el economista [spain] | \n", + "http://www.eleconomista.es/ | \n", + "es | \n", + "438106 | \n", + "224102 | \n", + "12713758424 | \n", + "{\"cat\":45,\"eng\":179,\"eng,fra,spa\":562,\"eng,ind... | \n", + "{\"application/font-woff\":1,\"application/pdf\":1... | \n", + "{\"2020\":194215,\"2021\":243891} | \n", + "{\"CC-MAIN-2020-05\":23904,\"CC-MAIN-2020-10\":185... | \n", + "
2 | \n", + "296 | \n", + "majadahonda magazin | \n", + "https://majadahondamagazin.es/ | \n", + "es | \n", + "18211 | \n", + "10608 | \n", + "533454607 | \n", + "{\"eng,spa\":8,\"fra,spa\":1,\"spa\":17371,\"spa,cat\"... | \n", + "{\"application/pdf\":22,\"application/rss+xml\":1,... | \n", + "{\"2020\":15638,\"2021\":2573} | \n", + "{\"CC-MAIN-2020-05\":2168,\"CC-MAIN-2020-10\":2006... | \n", + "
3 | \n", + "192 | \n", + "noticias ahora | \n", + "https://www.noticias-ahora.com/ | \n", + "es | \n", + "106632 | \n", + "73046 | \n", + "2634752488 | \n", + "{\"deu,bod,nno\":1,\"eng\":5,\"eng,spa\":157,\"spa\":6... | \n", + "{\"text/html\":106632} | \n", + "{\"2020\":55385,\"2021\":51247} | \n", + "{\"CC-MAIN-2020-05\":5035,\"CC-MAIN-2020-10\":4972... | \n", + "
4 | \n", + "30 | \n", + "radiocable | \n", + "http://www.radiocable.com/ | \n", + "es | \n", + "93152 | \n", + "19256 | \n", + "1092879146 | \n", + "{\"cat\":6,\"eng\":137,\"eng,glg\":8,\"eng,glg,oci\":1... | \n", + "{\"application/rss+xml\":9,\"application/xhtml+xm... | \n", + "{\"2020\":45590,\"2021\":47562} | \n", + "{\"CC-MAIN-2020-05\":3779,\"CC-MAIN-2020-10\":4998... | \n", + "
\n", + " | id | \n", + "title | \n", + "link | \n", + "language | \n", + "captures_total | \n", + "urls_uniq_estimate | \n", + "warc_size_in_bytes | \n", + "content_languages | \n", + "content_types | \n", + "captures_per_year | \n", + "captures_per_crawl | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|
253 | \n", + "497 | \n", + "News outlet | \n", + "https://www.straitstimes.com/ | \n", + "en | \n", + "580806 | \n", + "423401 | \n", + "22705382276 | \n", + "{\"eng\":574438,\"eng,ara\":9,\"eng,ben\":1,\"eng,cat... | \n", + "{\"application/pdf\":291,\"application/rss+xml\":1... | \n", + "{\"2020\":277490,\"2021\":303316} | \n", + "{\"CC-MAIN-2020-05\":32519,\"CC-MAIN-2020-10\":272... | \n", + "
136 | \n", + "333 | \n", + "el mundo (spain) | \n", + "http://www.elmundo.es/ | \n", + "es | \n", + "499242 | \n", + "409850 | \n", + "13946919763 | \n", + "{\"cat\":5,\"cat,spa\":466,\"cat,spa,deu\":1,\"cat,sp... | \n", + "{\"application/json\":3,\"application/rss+xml\":7,... | \n", + "{\"2020\":235654,\"2021\":263588} | \n", + "{\"CC-MAIN-2020-05\":32370,\"CC-MAIN-2020-10\":288... | \n", + "
310 | \n", + "63 | \n", + "la nacion (argentina) | \n", + "http://www.lanacion.com.ar/ | \n", + "es | \n", + "496848 | \n", + "382897 | \n", + "20352476991 | \n", + "{\"eng\":42,\"eng,spa\":78,\"lat,spa\":1,\"spa\":46433... | \n", + "{\"application/rss+xml\":2,\"text/html\":496843,\"t... | \n", + "{\"2020\":218658,\"2021\":278190} | \n", + "{\"CC-MAIN-2020-05\":29451,\"CC-MAIN-2020-10\":239... | \n", + "
109 | \n", + "307 | \n", + "europa press | \n", + "https://www.europapress.es/ | \n", + "es | \n", + "492017 | \n", + "448479 | \n", + "15435990919 | \n", + "{\"cat,spa\":2548,\"cat,spa,eng\":103,\"cat,spa,grn... | \n", + "{\"application/octet-stream\":40,\"application/rs... | \n", + "{\"2020\":204451,\"2021\":287566} | \n", + "{\"CC-MAIN-2020-05\":23867,\"CC-MAIN-2020-10\":207... | \n", + "
326 | \n", + "255 | \n", + "el comercio perú | \n", + "https://elcomercio.pe/ | \n", + "es | \n", + "489875 | \n", + "361216 | \n", + "16227224839 | \n", + "{\"eng\":3,\"eng,spa\":93,\"eng,spa,cat\":2,\"que,spa... | \n", + "{\"application/xhtml+xml\":4,\"image/jpeg\":1,\"tex... | \n", + "{\"2020\":229643,\"2021\":260232} | \n", + "{\"CC-MAIN-2020-05\":22913,\"CC-MAIN-2020-10\":316... | \n", + "
\n", + " | id | \n", + "title | \n", + "link | \n", + "language | \n", + "captures_total | \n", + "urls_uniq_estimate | \n", + "warc_size_in_bytes | \n", + "content_languages | \n", + "content_types | \n", + "captures_per_year | \n", + "captures_per_crawl | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|
167 | \n", + "316 | \n", + "la jornada | \n", + "http://www.jornada.unam.mx/ultimas | \n", + "es | \n", + "1 | \n", + "1 | \n", + "45869 | \n", + "{\"spa\":1} | \n", + "{\"application/xhtml+xml\":1} | \n", + "{\"2020\":1} | \n", + "{\"CC-MAIN-2020-10\":1} | \n", + "
388 | \n", + "283 | \n", + "la prensa grafica | \n", + "http://www.laprensagrafica.com/inicio | \n", + "es | \n", + "1 | \n", + "1 | \n", + "12698 | \n", + "{\"spa\":1} | \n", + "{\"text/html\":1} | \n", + "{\"2020\":1} | \n", + "{\"CC-MAIN-2020-45\":1} | \n", + "
223 | \n", + "110 | \n", + "onemi: ministerio del interior y seguridad púb... | \n", + "http://www.onemi.cl/ | \n", + "es | \n", + "2 | \n", + "2 | \n", + "12559 | \n", + "{\"spa\":1} | \n", + "{\"text/html\":2} | \n", + "{\"2020\":1,\"2021\":1} | \n", + "{\"CC-MAIN-2020-34\":1,\"CC-MAIN-2021-17\":1} | \n", + "
67 | \n", + "326 | \n", + "la opinión de tenerife | \n", + "http://www.laopinion.es/ | \n", + "es | \n", + "5 | \n", + "1 | \n", + "7763 | \n", + "{\"spa\":5} | \n", + "{\"text/html\":5} | \n", + "{\"2020\":4,\"2021\":1} | \n", + "{\"CC-MAIN-2020-10\":1,\"CC-MAIN-2020-16\":1,\"CC-M... | \n", + "
49 | \n", + "234 | \n", + "conred | \n", + "https://conred.gob.gt/emergencia/ | \n", + "es | \n", + "6 | \n", + "1 | \n", + "103054 | \n", + "{\"spa\":6} | \n", + "{\"text/html\":6} | \n", + "{\"2021\":6} | \n", + "{\"CC-MAIN-2021-04\":1,\"CC-MAIN-2021-17\":1,\"CC-M... | \n", + "
\n", + " | 0 | \n", + "1 | \n", + "
---|---|---|
0 | \n", + "text/html | \n", + "31535762 | \n", + "
1 | \n", + "application/xhtml+xml | \n", + "3744758 | \n", + "
2 | \n", + "application/pdf | \n", + "68266 | \n", + "
3 | \n", + "text/plain | \n", + "24513 | \n", + "
4 | \n", + "application/rss+xml | \n", + "20988 | \n", + "
5 | \n", + "image/jpeg | \n", + "7541 | \n", + "
6 | \n", + "application/atom+xml | \n", + "4555 | \n", + "
7 | \n", + "application/json | \n", + "1194 | \n", + "
8 | \n", + "application/xml | \n", + "541 | \n", + "
9 | \n", + "image/png | \n", + "501 | \n", + "
\n", + " | # | \n", + "Dataset title | \n", + "Domain Name / link\\n(if highlighted in red, it's a duplicate! So don't add it...) | \n", + "License\\n(default is UNKNOWN) | \n", + "Release (Issue date) | \n", + "Glottocode | \n", + "Language(s) (or family) | \n", + "Dialect/accent (if known) | \n", + "Subject | \n", + "Format | \n", + "Collection Style (Manual Curation vs Crowdsourced(web)) ? | \n", + "What it is / why we want it (5-25 words) | \n", + "Volume (estimates) | \n", + "Contains Personal Information? (-1=unlikely, 0=neutral, 1=likely) | \n", + "Owner | \n", + "Usage and relation to other datasets | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "12 | \n", + "Fundacion Cajasol | \n", + "https://fundacioncajasol.com/ | \n", + "unknown | \n", + "multiple releases | \n", + "stan1288 | \n", + "es | \n", + "Spain | \n", + "General News | \n", + "text (web) | \n", + "manual | \n", + "NaN | \n", + "unknown | \n", + "unknown | \n", + "Fundacion Cajasol | \n", + "NaN | \n", + "
1 | \n", + "13 | \n", + "Asamblea Nacional del Ecuador | \n", + "http://www.confirmado.net/ | \n", + "unknown | \n", + "multiple releases | \n", + "NaN | \n", + "es | \n", + "Ecuador | \n", + "General News | \n", + "text (web) | \n", + "manual | \n", + "NaN | \n", + "unknown | \n", + "unknown | \n", + "Asamblea Nacional del Ecuador | \n", + "NaN | \n", + "
2 | \n", + "14 | \n", + "el periodico de Tlaxcala | \n", + "https://elperiodicodetlaxcala.com/ | \n", + "unknown | \n", + "multiple releases | \n", + "NaN | \n", + "es | \n", + "Mexico | \n", + "General News | \n", + "text (web) | \n", + "manual | \n", + "NaN | \n", + "unknown | \n", + "unknown | \n", + "el periodico de Tlaxcala | \n", + "NaN | \n", + "
3 | \n", + "15 | \n", + "mispeces | \n", + "https://www.mispeces.com/ | \n", + "unknown | \n", + "multiple releases | \n", + "NaN | \n", + "es | \n", + "Spain | \n", + "General News | \n", + "text (web) | \n", + "manual | \n", + "NaN | \n", + "unknown | \n", + "unknown | \n", + "mispeces | \n", + "NaN | \n", + "
4 | \n", + "16 | \n", + "DiarioVasco | \n", + "https://www.diariovasco.com/ | \n", + "unknown | \n", + "multiple releases | \n", + "NaN | \n", + "es | \n", + "Spain | \n", + "General News | \n", + "text (web) | \n", + "manual | \n", + "NaN | \n", + "unknown | \n", + "unknown | \n", + "DiarioVasco | \n", + "NaN | \n", + "
\n", + " | id | \n", + "title | \n", + "link | \n", + "language | \n", + "
---|---|---|---|---|
0 | \n", + "12 | \n", + "Fundacion Cajasol | \n", + "https://fundacioncajasol.com/ | \n", + "es | \n", + "
1 | \n", + "13 | \n", + "Asamblea Nacional del Ecuador | \n", + "http://www.confirmado.net/ | \n", + "es | \n", + "
2 | \n", + "14 | \n", + "el periodico de Tlaxcala | \n", + "https://elperiodicodetlaxcala.com/ | \n", + "es | \n", + "
3 | \n", + "15 | \n", + "mispeces | \n", + "https://www.mispeces.com/ | \n", + "es | \n", + "
4 | \n", + "16 | \n", + "DiarioVasco | \n", + "https://www.diariovasco.com/ | \n", + "es | \n", + "
\n", + " | url_path_prefix | \n", + "
---|---|
/ | \n", + "391 | \n", + "
\n", + " | 13 | \n", + "
/es/ | \n", + "6 | \n", + "
/wps/portal/rielcano_es | \n", + "2 | \n", + "
/forums/ | \n", + "2 | \n", + "
/web/ | \n", + "1 | \n", + "
/es | \n", + "1 | \n", + "
/singapore | \n", + "1 | \n", + "
/informa | \n", + "1 | \n", + "
/forum/ | \n", + "1 | \n", + "
/es/news | \n", + "1 | \n", + "
/mining/ | \n", + "1 | \n", + "
/community | \n", + "1 | \n", + "
/news/es | \n", + "1 | \n", + "
/pagina/bienvenidos-al-comite-de-sanidad-vegetal-cosave | \n", + "1 | \n", + "
/ultimas | \n", + "1 | \n", + "
/index.php/es | \n", + "1 | \n", + "
/spanish/ | \n", + "1 | \n", + "
/kiasu | \n", + "1 | \n", + "
/tecnopunta | \n", + "1 | \n", + "
/search/label/primera | \n", + "1 | \n", + "
/r/singapore/ | \n", + "1 | \n", + "
/blog/ | \n", + "1 | \n", + "
/site/ | \n", + "1 | \n", + "
/presidencia/ | \n", + "1 | \n", + "
/index.php/riesed | \n", + "1 | \n", + "
/index.php/es/ | \n", + "1 | \n", + "
/upotec | \n", + "1 | \n", + "
/emergencia/ | \n", + "1 | \n", + "
/portada | \n", + "1 | \n", + "
/home/es/ | \n", + "1 | \n", + "
/endirecto/ | \n", + "1 | \n", + "
/noticias-guatemala-diario-centro-america/ | \n", + "1 | \n", + "
/en/ | \n", + "1 | \n", + "
/cadiz/ | \n", + "1 | \n", + "
/news/ | \n", + "1 | \n", + "
/monplaneta/ | \n", + "1 | \n", + "
/web/sala-de-prensa | \n", + "1 | \n", + "
/noticias/ | \n", + "1 | \n", + "
/envivo/ | \n", + "1 | \n", + "
/inicio | \n", + "1 | \n", + "
/va/inicio/presentacion | \n", + "1 | \n", + "
/index.php/riceg/index | \n", + "1 | \n", + "
/portal/ | \n", + "1 | \n", + "
/noticias | \n", + "1 | \n", + "
/ww5/ | \n", + "1 | \n", + "
/alacarta/ | \n", + "1 | \n", + "
\n", + " | url_path_prefix | \n", + "
---|---|
/ | \n", + "400 | \n", + "
\n", + " | 13 | \n", + "
/es/ | \n", + "6 | \n", + "
/forums/ | \n", + "2 | \n", + "
/web/ | \n", + "1 | \n", + "
/presidencia/ | \n", + "1 | \n", + "
/es | \n", + "1 | \n", + "
/informa | \n", + "1 | \n", + "
/forum/ | \n", + "1 | \n", + "
/mining/ | \n", + "1 | \n", + "
/news/es | \n", + "1 | \n", + "
/site/ | \n", + "1 | \n", + "
/blog/ | \n", + "1 | \n", + "
/spanish/ | \n", + "1 | \n", + "
/index.php/es | \n", + "1 | \n", + "
/kiasu | \n", + "1 | \n", + "
/tecnopunta | \n", + "1 | \n", + "
/inicio | \n", + "1 | \n", + "
/r/singapore/ | \n", + "1 | \n", + "
/ultimas | \n", + "1 | \n", + "
/portal/ | \n", + "1 | \n", + "
/index.php/es/ | \n", + "1 | \n", + "
/ww5/ | \n", + "1 | \n", + "
/emergencia/ | \n", + "1 | \n", + "
/portada | \n", + "1 | \n", + "
/home/es/ | \n", + "1 | \n", + "
/es/news | \n", + "1 | \n", + "
/endirecto/ | \n", + "1 | \n", + "
/singapore | \n", + "1 | \n", + "
/community | \n", + "1 | \n", + "
/cadiz/ | \n", + "1 | \n", + "
/news/ | \n", + "1 | \n", + "
/upotec | \n", + "1 | \n", + "
/monplaneta/ | \n", + "1 | \n", + "
/noticias/ | \n", + "1 | \n", + "
/envivo/ | \n", + "1 | \n", + "
/en/ | \n", + "1 | \n", + "
/noticias | \n", + "1 | \n", + "
/alacarta/ | \n", + "1 | \n", + "
\n", + " | id | \n", + "title | \n", + "link | \n", + "language | \n", + "url_path_prefix | \n", + "url_host_name | \n", + "url_host_registered_domain | \n", + "url_surtkey | \n", + "
---|---|---|---|---|---|---|---|---|
0 | \n", + "12 | \n", + "Fundacion Cajasol | \n", + "https://fundacioncajasol.com/ | \n", + "es | \n", + "/ | \n", + "fundacioncajasol.com | \n", + "fundacioncajasol.com | \n", + "com,fundacioncajasol)/ | \n", + "
1 | \n", + "13 | \n", + "Asamblea Nacional del Ecuador | \n", + "http://www.confirmado.net/ | \n", + "es | \n", + "/ | \n", + "www.confirmado.net | \n", + "confirmado.net | \n", + "net,confirmado)/ | \n", + "
2 | \n", + "14 | \n", + "el periodico de Tlaxcala | \n", + "https://elperiodicodetlaxcala.com/ | \n", + "es | \n", + "/ | \n", + "elperiodicodetlaxcala.com | \n", + "elperiodicodetlaxcala.com | \n", + "com,elperiodicodetlaxcala)/ | \n", + "
3 | \n", + "15 | \n", + "mispeces | \n", + "https://www.mispeces.com/ | \n", + "es | \n", + "/ | \n", + "www.mispeces.com | \n", + "mispeces.com | \n", + "com,mispeces)/ | \n", + "
4 | \n", + "16 | \n", + "DiarioVasco | \n", + "https://www.diariovasco.com/ | \n", + "es | \n", + "/ | \n", + "www.diariovasco.com | \n", + "diariovasco.com | \n", + "com,diariovasco)/ | \n", + "
\n", + " | id | \n", + "title | \n", + "link | \n", + "language | \n", + "url_path_prefix | \n", + "url_host_name | \n", + "url_host_registered_domain | \n", + "url_surtkey | \n", + "
---|---|---|---|---|---|---|---|---|
60 | \n", + "72 | \n", + "elcano royal insitute (real istituto elcano) | \n", + "http://www.realinstitutoelcano.org/ | \n", + "es | \n", + "/ | \n", + "www.realinstitutoelcano.org | \n", + "realinstitutoelcano.org | \n", + "org,realinstitutoelcano)/ | \n", + "
82 | \n", + "94 | \n", + "real instituto elcano | \n", + "http://www.realinstitutoelcano.org/ | \n", + "es | \n", + "/ | \n", + "www.realinstitutoelcano.org | \n", + "realinstitutoelcano.org | \n", + "org,realinstitutoelcano)/ | \n", + "