Skip to content

Commit

Permalink
Merge pull request RockefellerArchiveCenter#510 from RockefellerArchi…
Browse files Browse the repository at this point in the history
…veCenter/issue-507

Strip XML and HTML tags from note text and title fields
  • Loading branch information
helrond authored Jun 6, 2022
2 parents 41e05e7 + a96a465 commit 168d0aa
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 12 deletions.
35 changes: 24 additions & 11 deletions transformer/mappings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import json
import re
import xml.etree.ElementTree as ET

import odin
import requests
Expand Down Expand Up @@ -51,9 +53,15 @@ def has_online_instance(instances, uri):
return False


def replace_xml(content_list):
    """Replaces EAD ``extref`` tags in notes with HTML ``a`` tags.

    Only the tag name of opening (``<extref ...>``) and closing
    (``</extref>``) tags is rewritten. Attributes and any other text that
    happens to contain the substring "extref" are left untouched.

    Args:
        content_list: iterable of note content strings.

    Returns:
        A list with one converted string per input string.
    """
    # A plain str.replace("extref", "a") would also rewrite prose, attribute
    # values and longer tag names; anchor on "<" or "</" and a word boundary.
    return [re.sub(r'<(/?)extref\b', r'<\1a', c) for c in content_list]
# Fallback pattern for markup that is not well-formed XML. The quantifier is
# ``*`` (not ``+``) so that single-character tags such as <p> or <b> match.
_TAG_PATTERN = re.compile(r'<[/\w][^>]*>')


def strip_tags(user_string):
    """Strips XML and HTML tags from a string.

    The string is first wrapped in a root element and parsed as XML; when
    that succeeds the concatenated text content is returned. If the string
    is not well-formed XML (unclosed tags, undefined entities, stray angle
    brackets), tags are removed with a regular expression instead and the
    remaining text is returned as-is.

    Args:
        user_string: text which may contain XML or HTML markup.

    Returns:
        The text with tags removed.
    """
    try:
        xmldoc = ET.fromstring(f'<xml>{user_string}</xml>')
    except ET.ParseError:
        # Not parseable as XML: fall back to removing anything tag-shaped.
        return _TAG_PATTERN.sub('', user_string)
    return ''.join(xmldoc.itertext())


def transform_language(value, lang_materials):
Expand Down Expand Up @@ -123,7 +131,7 @@ class SourceAncestorToRecordReference(odin.Mapping):

@odin.map_field(from_field="title", to_field="title")
def title(self, value):
return value.strip()
return strip_tags(value.strip())

@odin.map_field(from_field="order", to_field="order")
def order(self, value):
Expand Down Expand Up @@ -159,7 +167,7 @@ def type(self, value):

@odin.map_field(from_field="title", to_field="title")
def title(self, value):
return value.strip()
return strip_tags(value.strip())

@odin.map_list_field(from_field="ref", to_field="external_identifiers", to_list=True)
def external_identifiers(self, value):
Expand Down Expand Up @@ -297,8 +305,8 @@ def map_subnotes(self, value):
subnote = self.chronology_subnotes(value.items)
else:
subnote = Subnote(
type="text", content=replace_xml(value.content)
if isinstance(value.content, list) else replace_xml([value.content]))
type="text", content=[strip_tags(c) for c in value.content]
if isinstance(value.content, list) else [strip_tags(value.content)])
return subnote

@odin.map_list_field(from_field="subnotes", to_field="subnotes", to_list=True)
Expand All @@ -309,7 +317,7 @@ def subnotes(self, value):
elif self.source.jsonmodel_type in ["note_singlepart"]:
# Here content is a list passed as a string, so we have to reconvert.
content = [self.source.content.strip("][\"\'")]
subnotes = [Subnote(type="text", content=replace_xml(content))]
subnotes = [Subnote(type="text", content=[strip_tags(c) for c in content])]
elif self.source.jsonmodel_type == "note_index":
subnotes = self.index_subnotes(self.source.content, self.source.items)
elif self.source.jsonmodel_type == "note_bibliography":
Expand All @@ -322,7 +330,7 @@ def bibliograpy_subnotes(self, raw_content, items):
data = []
# Here content is a list passed as a string, so we have to reconvert.
content = [raw_content.strip("][\'")]
data.append(Subnote(type="text", content=replace_xml(content)))
data.append(Subnote(type="text", content=[strip_tags(c) for c in content]))
data.append(Subnote(type="orderedlist", content=items))
return data

Expand All @@ -343,6 +351,10 @@ class SourceResourceToCollection(odin.Mapping):
from_obj = SourceResource
to_obj = Collection

@odin.map_field(from_field="title", to_field="title")
def title(self, value):
    """Returns the source title with XML and HTML tags removed."""
    cleaned_title = strip_tags(value)
    return cleaned_title

@odin.map_list_field(from_field="notes", to_field="notes", to_list=True)
def notes(self, value):
    """Passes published notes of a transformable type to SourceNoteToNote.

    A note is kept only when it is published and its type appears in
    NOTE_TYPE_CHOICES_TRANSFORM.
    """
    eligible = []
    for note in value:
        if note.publish and note.type in NOTE_TYPE_CHOICES_TRANSFORM:
            eligible.append(note)
    return SourceNoteToNote.apply(eligible)
Expand Down Expand Up @@ -410,7 +422,7 @@ def title(self, value):
title = value.strip() if value else self.source.display_string.strip()
if getattr(self.source, "component_id", None):
title = "{}, {} {}".format(title, self.source.level.capitalize(), self.source.component_id)
return title
return strip_tags(title)

@odin.map_field(from_field="language", to_field="languages", to_list=True)
def languages(self, value):
Expand Down Expand Up @@ -476,7 +488,8 @@ def dates(self, value):

@odin.map_field
def title(self, value):
return value.strip() if value else self.source.display_string.strip()
title = value.strip() if value else self.source.display_string.strip()
return strip_tags(title)

@odin.map_field(from_field="language", to_field="languages", to_list=True)
def languages(self, value):
Expand Down
6 changes: 5 additions & 1 deletion transformer/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from fetcher.helpers import identifier_from_uri

from .cron import CheckMissingOnlineAssets
from .mappings import has_online_instance
from .mappings import has_online_instance, strip_tags
from .models import DataObject
from .resources.configs import NOTE_TYPE_CHOICES_TRANSFORM
from .transformers import Transformer
Expand Down Expand Up @@ -255,3 +255,7 @@ def test_transformer(self):
def test_ping(self):
    """The view registered under the name 'ping' answers with HTTP 200."""
    ping_url = reverse('ping')
    status = self.client.get(ping_url).status_code
    self.assertEqual(status, 200)

def test_strip_tags(self):
    """strip_tags reduces marked-up and plain samples to the same text."""
    samples = [
        "<title>a collection</title>",
        "a <a href='https://example.com'>collection</a>",
        "a collection",
    ]
    # Named `raw` rather than `input` so the builtin is not shadowed; subTest
    # reports which sample failed instead of stopping at the first one.
    for raw in samples:
        with self.subTest(raw=raw):
            self.assertEqual('a collection', strip_tags(raw))

0 comments on commit 168d0aa

Please sign in to comment.