add support for oereblex api version 1.2.5 #345

Merged: 5 commits, Dec 13, 2024
6 changes: 3 additions & 3 deletions CHANGELOG
@@ -1,10 +1,10 @@
Changelog
=========
Change log
==========

unreleased
----------

Supported GEO-Link API versions: v1.0.0, v1.1.0, v1.1.1, v1.2.0, v1.2.1, v1.2.2, v1.2.3, v1.2.4 (default)
Supported GEO-Link API versions: v1.0.0, v1.1.0, v1.1.1, v1.2.0, v1.2.1, v1.2.2, v1.2.3, v1.2.4, v1.2.5 (default)

- Drop support for Python 3.8
- Dependency updates
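The changelog entry above bumps the default GEO-Link API version to v1.2.5 while keeping the older versions selectable. A minimal sketch (not part of this diff) of picking a version explicitly, assuming the package is installed so its bundled XSD schemas can be located:

```python
from geolink_formatter.parser import XML

# Default now targets schema v1.2.5 (see parser.py below).
parser_default = XML()

# Older supported schema versions can still be selected explicitly.
parser_v124 = XML(version='1.2.4')
```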
18 changes: 17 additions & 1 deletion geolink_formatter/entity.py
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
""" Provides the Msg, Document and File classes. """
import datetime


@@ -12,7 +13,8 @@ class Document(object):
def __init__(self, files, id=None, category=None, doctype=None, federal_level=None, authority=None,
authority_url=None, title=None, number=None, abbreviation=None, instance=None, type=None,
subtype=None, decree_date=None, enactment_date=None, abrogation_date=None, cycle=None,
municipality=None, index=None, status=None, status_start_date=None, status_end_date=None):
municipality=None, index=None, status=None, status_start_date=None, status_end_date=None,
language_document=None, language_link=None):
"""Creates a new document instance.

Args:
@@ -38,6 +40,8 @@ def __init__(self, files, id=None, category=None, doctype=None, federal_level=No
status (str or None): The status of the prepublication.
status_start_date (datetime.date or None): Start date of the status.
status_end_date (datetime.date or None): End date of the status.
language_document (str or None): Language of the document.
language_link (str or None): Language of the geolink/prepublink collection.

Raises:
TypeError: Raised on missing argument or invalid argument type.
@@ -109,6 +113,8 @@ def __init__(self, files, id=None, category=None, doctype=None, federal_level=No
self._status = status
self._status_start_date = status_start_date
self._status_end_date = status_end_date
self._language_document = language_document
self._language_link = language_link

@property
def files(self):
@@ -220,6 +226,16 @@ def status_end_date(self):
"""datetime.date: End date of the status (since v1.2.2)."""
return self._status_end_date

@property
def language_document(self):
"""str: Language of the document (since v1.2.5)."""
return self._language_document

@property
def language_link(self):
"""str: Language of the geolink or prepublink (since v1.2.5)."""
return self._language_link


class File(object):
def __init__(self, category=None, href=None, title=None, description=None):
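For illustration, a minimal sketch (not part of this diff) of the two new Document fields added above; the values and the single File are made up for the example:

```python
from geolink_formatter.entity import Document, File

doc = Document(
    files=[File(title='doc.pdf', href='https://example.org/doc.pdf', category='main')],
    id='42',
    title='Example decree',
    language_document='de',  # language of the document itself (new in v1.2.5)
    language_link='de',      # language of the geolink/prepublink collection (new in v1.2.5)
)

# The new read-only properties expose the values passed to the constructor.
print(doc.language_document, doc.language_link)
```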
1 change: 1 addition & 0 deletions geolink_formatter/format.py
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
""" Provides class HTML for rendering the xml document from the geolink api as html. """
from datetime import date


190 changes: 119 additions & 71 deletions geolink_formatter/parser.py
@@ -1,10 +1,12 @@
# -*- coding: utf-8 -*-
""" Provides classes for parsing the xml document from the geolink api. """
from importlib import resources as impresources

Check warning (Code scanning / Pylint, reported by Codacy): No name 'resources' in module 'importlib'
import datetime
import pkg_resources
import requests
from lxml.etree import DTD, DocumentInvalid, fromstring
from xmlschema import XMLSchema11
from geolink_formatter.entity import Document, File
from geolink_formatter.utils import filter_duplicated_documents


class SCHEMA(object):
@@ -32,15 +34,18 @@
"""str: geoLink schema version 1.2.3"""

V1_2_4 = '1.2.4'
"""str: geoLink schema version 1.2.3"""
"""str: geoLink schema version 1.2.4"""

V1_2_5 = '1.2.5'
"""str: geoLink schema version 1.2.5"""


class XML(object):

_date_format = '%Y-%m-%d'
"""str: Format of date values in XML."""

def __init__(self, host_url=None, version='1.2.4', dtd_validation=False, xsd_validation=True):
def __init__(self, host_url=None, version='1.2.5', dtd_validation=False, xsd_validation=True):
"""Create a new XML parser instance containing the geoLink XSD for validation.

Args:
@@ -57,10 +62,10 @@
self._version = version
self._dtd_validation = dtd_validation
self._xsd_validation = xsd_validation
xsd = pkg_resources.resource_filename('geolink_formatter', 'schema/v{0}.xsd'.format(version))
xsd = impresources.files('geolink_formatter') / 'schema' / f'v{version}.xsd'
if self._xsd_validation:
with open(xsd, encoding='utf-8') as f:
self._schema = XMLSchema11(f.read())
with xsd.open(mode='r', encoding='utf-8') as xsd_f:
self._schema = XMLSchema11(xsd_f.read())

@property
def host_url(self):
@@ -94,6 +99,101 @@
raise DocumentInvalid('Missing DTD in parsed content')
return content

def _process_single_document(self, document_el, language_link):
"""
Processes a single document element.

Args:
document_el (lxml.etree._Element): element 'document'
language_link (str): language of the document set

Returns:
geolink_formatter.entity.Document: document

"""
doc_id = document_el.attrib.get('id')
doctype = document_el.attrib.get('doctype')

# Mangle doc_id for notices. While IDs are unique between decrees
# and edicts, this is not the case when adding notices to the mix.
if doctype == 'notice':
doc_id += doctype

files = []
for file_el in document_el.iter('file'):
href = file_el.attrib.get('href')
if self.host_url and not href.startswith('http://') and not href.startswith('https://'):
href = f'{self.host_url}{href}'
files.append(File(
title=file_el.attrib.get('title'),
description=file_el.attrib.get('description'),
href=href,
category=file_el.attrib.get('category')
))
enactment_date = document_el.attrib.get('enactment_date')
if enactment_date:
enactment_date = datetime.datetime.strptime(enactment_date, self._date_format).date()
decree_date = document_el.attrib.get('decree_date')
if decree_date:
decree_date = datetime.datetime.strptime(decree_date, self._date_format).date()
abrogation_date = document_el.attrib.get('abrogation_date')
if abrogation_date:
abrogation_date = datetime.datetime.strptime(abrogation_date, self._date_format).date()
status_start_date = document_el.attrib.get('status_start_date')
if status_start_date:
status_start_date = datetime.datetime.strptime(status_start_date, self._date_format)\
.date()
status_end_date = document_el.attrib.get('status_end_date')
if status_end_date:
status_end_date = datetime.datetime\
.strptime(status_end_date, self._date_format).date()

document = Document(
files=files,
id=doc_id,
category=document_el.attrib.get('category'),
doctype=document_el.attrib.get('doctype'),
federal_level=document_el.attrib.get('federal_level'),
authority=document_el.attrib.get('authority'),
authority_url=document_el.attrib.get('authority_url'),
title=document_el.attrib.get('title'),
number=document_el.attrib.get('number'),
abbreviation=document_el.attrib.get('abbreviation'),
instance=document_el.attrib.get('instance'),
type=document_el.attrib.get('type'),
subtype=document_el.attrib.get('subtype'),
decree_date=decree_date,
enactment_date=enactment_date,
abrogation_date=abrogation_date,
cycle=document_el.attrib.get('cycle'),
municipality=document_el.attrib.get('municipality'),
index=document_el.attrib.get('index'),
status=document_el.attrib.get('status'),
status_start_date=status_start_date,
status_end_date=status_end_date,
language_document=document_el.attrib.get('language'),
language_link=language_link
)

return document

def _process_geolinks_prepublinks(self, geolink_prepublink_el):
"""
Processes a 'geolinks' or 'prepublinks' element.

Args:
geolink_prepublink_el (lxml.etree._Element): element 'geolinks' or 'prepublinks'

Returns:
list[geolink_formatter.entity.Document]: list of documents
"""
language_link = geolink_prepublink_el.get('language')

documents = []
for document_el in geolink_prepublink_el.iter('document'):
documents.append(self._process_single_document(document_el, language_link))
return documents

def from_string(self, xml):
"""Parses XML into internal structure.

@@ -109,71 +209,19 @@
lxml.etree.XMLSyntaxError: Raised on failed validation.
"""
root = self._parse_xml(xml)
documents = list()

for document_el in root.iter('document'):
doc_id = document_el.attrib.get('id')
doctype = document_el.attrib.get('doctype')

# Mangle doc_id for notices. While IDs are unique between decrees
# and edicts, this is not the case when adding notices to the mix.
if doctype == 'notice':
doc_id += doctype

if doc_id and doc_id not in [doc.id for doc in documents]:
files = list()
for file_el in document_el.iter('file'):
href = file_el.attrib.get('href')
if self.host_url and not href.startswith(u'http://') and not href.startswith(u'https://'):
href = u'{host}{href}'.format(host=self.host_url, href=href)
files.append(File(
title=file_el.attrib.get('title'),
description=file_el.attrib.get('description'),
href=href,
category=file_el.attrib.get('category')
))
enactment_date = document_el.attrib.get('enactment_date')
if enactment_date:
enactment_date = datetime.datetime.strptime(enactment_date, self._date_format).date()
decree_date = document_el.attrib.get('decree_date')
if decree_date:
decree_date = datetime.datetime.strptime(decree_date, self._date_format).date()
abrogation_date = document_el.attrib.get('abrogation_date')
if abrogation_date:
abrogation_date = datetime.datetime.strptime(abrogation_date, self._date_format).date()
status_start_date = document_el.attrib.get('status_start_date')
if status_start_date:
status_start_date = datetime.datetime.strptime(status_start_date, self._date_format)\
.date()
status_end_date = document_el.attrib.get('status_end_date')
if status_end_date:
status_end_date = datetime.datetime.strptime(status_end_date, self._date_format)\
.date()

documents.append(Document(
files=files,
id=doc_id,
category=document_el.attrib.get('category'),
doctype=document_el.attrib.get('doctype'),
federal_level=document_el.attrib.get('federal_level'),
authority=document_el.attrib.get('authority'),
authority_url=document_el.attrib.get('authority_url'),
title=document_el.attrib.get('title'),
number=document_el.attrib.get('number'),
abbreviation=document_el.attrib.get('abbreviation'),
instance=document_el.attrib.get('instance'),
type=document_el.attrib.get('type'),
subtype=document_el.attrib.get('subtype'),
decree_date=decree_date,
enactment_date=enactment_date,
abrogation_date=abrogation_date,
cycle=document_el.attrib.get('cycle'),
municipality=document_el.attrib.get('municipality'),
index=document_el.attrib.get('index'),
status=document_el.attrib.get('status'),
status_start_date=status_start_date,
status_end_date=status_end_date
))
documents = []

# evaluate root element's tag
if root.tag in ['multilang_geolinks', 'multilang_prepublinks']:
for elem in root.iter('geolinks', 'prepublinks'):
documents.extend(self._process_geolinks_prepublinks(elem))
elif root.tag in ['geolinks', 'prepublinks']:
documents.extend(self._process_geolinks_prepublinks(root))
else:
raise RuntimeError(f'Unexpected tag name: {root.tag}')

# filter documents (remove duplicates)
documents = filter_duplicated_documents(documents)

return documents

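For illustration, a minimal sketch (not part of this diff) of how the refactored from_string() handles the new multilang root elements; the payload below is hand-written for the example and not a schema-valid oereblex response, so XSD validation is switched off:

```python
from geolink_formatter.parser import XML

# Hand-written sample; the attribute set is illustrative, not schema-complete.
xml = b"""<multilang_geolinks>
  <geolinks language="de">
    <document id="1" doctype="decree" title="Beispiel" language="de">
      <file title="doc.pdf" href="/file/1.pdf" category="main"/>
    </document>
  </geolinks>
</multilang_geolinks>"""

parser = XML(host_url='https://oereblex.example.ch', xsd_validation=False)
documents = parser.from_string(xml)

for doc in documents:
    # language_link comes from the surrounding geolinks element,
    # language_document from the document element's own 'language' attribute.
    print(doc.id, doc.language_link, doc.language_document)
```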