"""
Reading and modelling EMu texexport dump files.
"""
import gzip
import itertools
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
from functools import cached_property, total_ordering
from pathlib import Path
from typing import Dict, Union, Tuple, Iterable, Any
from typing import List

from splitgill.utils import to_timestamp, parse_to_timestamp

# the name of the record ID field present in every EMu table
EMU_ID_FIELD = "irn"
# this is arbitrary-ish but it's the time of the first good full dumps we have
FIRST_VERSION = to_timestamp(datetime(2017, 8, 30))


@dataclass
class EMuRecord:
    """
    A class representing an EMu record.
    """

    # the record's IRN
    id: int
    # the version (a timestamp) of the dump the record came from
    version: int
    # the record's fields; repeated fields are collapsed into a tuple of values
    data: Dict[str, Union[str, Tuple[str]]]

    @property
    def is_deleted(self) -> bool:
        """
        Returns True if the record's data represents a deletion, False if not.

        :return: True if the record's data represents a deletion, False if not.
        """
        return not bool(self.data)

    def __contains__(self, field: str) -> bool:
        """
        Checks if the given field is present in this record's data.

        :param field: the field name
        :return: True if the field exists, False if not
        """
        return field in self.data


@total_ordering
class EMuTable(Enum):
    """
    Enumeration of the EMu tables we currently handle.

    The value of the enum indicates the order they should be ingested in with EAudit
    first and then the others after.
    """

    eaudit = 0
    ecatalogue = 1
    emultimedia = 2
    etaxonomy = 3

    def __lt__(self, other):
        # implemented for the total_ordering annotation on the class and allow the
        # values to be used to prioritise the ingest of the tables
        if isinstance(other, EMuTable):
            return self.value < other.value
        return NotImplemented

    @property
    def is_stored(self) -> bool:
        """
        Whether the table's data should be stored or not.

        Currently, only EAudit is ignored as it is actually providing information about
        the other tables (like deletes).
        """
        return self != EMuTable.eaudit


def find_emu_dumps(root: Path, after: int = FIRST_VERSION) -> List["EMuDump"]:
    """
    Find all the EMu dumps in the given path and return them as a list of EMuDump
    objects. The list returned will be sorted in the order that the dumps should be
    processed.

    :param root: the root directory
    :param after: only dumps on or after this version will be returned, defaults to the
                  first full EMu dump from 30/08/17 (see FIRST_VERSION at the module
                  level)
    :return: a sorted list of EMuDump objects
    """
    dumps = []
    # dump file names look like <table>.export.<yyyymmdd>.gz, with eaudit dumps using
    # "deleted-export" instead of "export". Note the named groups: `(?P<table>` and
    # `(?P<date>` — a bare `(?P` is a regex syntax error
    dump_matcher = re.compile(
        r"(?P<table>\w+)\.(?:deleted-)?export\.(?P<date>[0-9]{8})\.gz"
    )

    for path in root.iterdir():
        match = dump_matcher.match(path.name)
        if match:
            table_name, date = match.groups()
            try:
                table = EMuTable[table_name]
            except KeyError:
                # ignore as we don't deal with this table
                continue

            if table is EMuTable.eaudit:
                dump = EMuAuditDump(path, table, date)
            else:
                dump = EMuDump(path, table, date)

            if dump.version >= after:
                dumps.append(dump)

    return sorted(dumps)
+ """ + + def __init__(self, path: Path, table: EMuTable, date: str): + """ + :param path: the full path to the dump file + :param table: the table the dump file is from + :param date: the date string of the export + """ + self.path = path + self.table = table + self.date = date + # convert the date into a version timestamp + self.version = parse_to_timestamp(date, "%Y%m%d", tzinfo=timezone.utc) + + @property + def is_audit(self): + """ + Is this an audit dump? + + :return: True if it is, False if not. + """ + return self.table == EMuTable.eaudit + + def __str__(self) -> str: + return f"Dump {self.table}@{self.version}/{self.date} [{self.path}]" + + def __eq__(self, other: Any): + if isinstance(other, EMuDump): + return self.version == other.version and self.table == other.table + return NotImplemented + + def __lt__(self, other: Any): + if isinstance(other, EMuDump): + # order by version, then table. The main goal here is to ensure the versions + # are ordered correctly and the audit dumps are ordered before normal tables + # as we need to do deletes first + return (self.version, self.table) < (other.version, other.table) + + return NotImplemented + + @property + def size(self) -> int: + """ + Returns the size of the dump in bytes. This is the size of the compressed dump + file. + + :return: the file size of the dump + """ + return self.path.stat().st_size + + @cached_property + def count(self) -> int: + """ + Returns the number of records in the dump by counting the number of IRNs we + find. This requires reading the whole dump. + + :return: the number of records in the dump + """ + irn_field_prefix = f"{EMU_ID_FIELD}:1=" + with gzip.open(self.path, "rt", encoding="utf-8") as f: + return sum(1 for line in f if line.lstrip().startswith(irn_field_prefix)) + + def __iter__(self) -> Iterable[EMuRecord]: + """ + Reads the dump file and yield an EMuRecord object per record found in the dump. 
+ If a record read from the dump doesn't have a detectable IRN then no record is + yielded and that record is skipped. + + :return: yields EMuRecord objects + """ + # cache this, so we don't have to look it up everytime we want to use it (for + # performance) + version = self.version + + with gzip.open(self.path, "rt", encoding="utf-8") as f: + # state variables for each record + emu_id = None + data = {} + + # each record is delimited in the EMu dump using a line with just ### on it. + # This chain here ensures that the file ends with a ### line even if one + # isn't in the file, thus forcing the record to be yielded if it's valid. + # Note that I've never seen a file not end with ### in the real world, but + # anything's possible with EMu! + for line in itertools.chain(f, ["###"]): + line = line.strip() + if not line: + continue + + if line != "###": + # the format is := + field, value = line.split("=", 1) + field = field.split(":", 1)[0] + + if field == EMU_ID_FIELD: + emu_id = int(value) + + existing = data.get(field) + if existing is None: + # the value isn't in the data dict, add it + data[field] = value + else: + if isinstance(existing, tuple): + # there is an existing set of values in the data dict, add + # the new value in a new tuple + data[field] = (*existing, value) + else: + # there is an existing value (just one) in the data dict, + # add the new value in a new tuple + data[field] = (existing, value) + else: + if emu_id is not None: + yield EMuRecord(emu_id, version, data) + + # refresh our record state holding variables + emu_id = None + data = {} + + +class EMuAuditDump(EMuDump): + """ + Class representing an EMu audit table export (or "dump") texexport file. + + Each file represents data from the EAudit table which accounts for changes to any + table in EMu. We specifically filter the audit table dumps for deletions, this is + achieved through an overriden __iter__ method. 
+ """ + + def __iter__(self) -> Iterable[EMuRecord]: + def record_filter(record: EMuRecord): + # filter the dump's records so that only valid deletions are yielded + return ( + # we only want delete operations + record.data.get("AudOperation") == "delete" + # AudKey is the irn of the deleted record, so it must have this field + and "AudKey" in record.data + # and this is the table the record was deleted from + and "AudTable" in record.data + ) + + yield from filter(record_filter, super().__iter__()) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/emu/__init__.py b/tests/emu/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/emu/test_dumps.py b/tests/emu/test_dumps.py new file mode 100644 index 0000000..90bf949 --- /dev/null +++ b/tests/emu/test_dumps.py @@ -0,0 +1,241 @@ +import gzip +from datetime import datetime +from pathlib import Path +from unittest.mock import MagicMock + +from splitgill.utils import to_timestamp + +from dataimporter.emu.dumps import ( + EMuTable, + find_emu_dumps, + EMuDump, + EMuAuditDump, + EMU_ID_FIELD, + EMuRecord, +) +from tests.helpers.dumps import create_dump + + +class TestEMuRecord: + def test_is_deleted(self): + assert EMuRecord(1, 2, {}).is_deleted + assert not EMuRecord(1, 2, {"a": "4"}).is_deleted + + def test_contains(self): + record = EMuRecord(1, 2, {"a": "4", "b": "26"}) + assert "a" in record + assert "b" in record + assert "c" not in record + + +class TestEMuTable: + def test_ordering(self): + tables = sorted(EMuTable) + assert tables[0] == EMuTable.eaudit + + def test_is_stored(self): + for table in EMuTable: + if table == EMuTable.eaudit: + assert not table.is_stored + else: + assert table.is_stored + + +class TestFindEMuDumps: + def test_no_files(self, tmp_path: Path): + assert not find_emu_dumps(tmp_path) + + def test_after_works(self, tmp_path: Path): + after = to_timestamp(datetime(2020, 3, 15)) + + for day in range(12, 18): + 
create_dump(tmp_path, EMuTable.ecatalogue, datetime(2020, 3, day)) + + dumps = find_emu_dumps(tmp_path, after=after) + assert len(dumps) == 3 + + def test_skip_invalid(self, tmp_path: Path): + create_dump(tmp_path, EMuTable.ecatalogue, datetime(2020, 3, 1)) + create_dump(tmp_path, "invalid", datetime(2020, 3, 2)) + + dumps = find_emu_dumps(tmp_path) + assert len(dumps) == 1 + + def test_audit_dumps(self, tmp_path: Path): + path_1 = create_dump(tmp_path, EMuTable.eaudit, datetime(2020, 2, 1)) + path_2 = create_dump(tmp_path, EMuTable.ecatalogue, datetime(2020, 3, 1)) + + dumps = find_emu_dumps(tmp_path) + + assert len(dumps) == 2 + assert dumps[0] == EMuAuditDump(path_1, EMuTable.eaudit, "20200201") + assert dumps[1] == EMuDump(path_2, EMuTable.ecatalogue, "20200301") + assert isinstance(dumps[0], EMuAuditDump) + assert isinstance(dumps[0], EMuDump) + + def test_order(self, tmp_path: Path): + path_4 = create_dump(tmp_path, EMuTable.etaxonomy, datetime(2020, 2, 4)) + path_2 = create_dump(tmp_path, EMuTable.ecatalogue, datetime(2020, 2, 1)) + path_1 = create_dump(tmp_path, EMuTable.eaudit, datetime(2020, 2, 1)) + path_5 = create_dump(tmp_path, EMuTable.emultimedia, datetime(2020, 3, 4)) + path_3 = create_dump(tmp_path, EMuTable.eaudit, datetime(2020, 2, 4)) + + dumps = find_emu_dumps(tmp_path) + + assert dumps == [ + EMuAuditDump(path_1, EMuTable.eaudit, "20200201"), + EMuAuditDump(path_2, EMuTable.ecatalogue, "20200201"), + EMuAuditDump(path_3, EMuTable.eaudit, "20200204"), + EMuAuditDump(path_4, EMuTable.etaxonomy, "20200204"), + EMuAuditDump(path_5, EMuTable.emultimedia, "20200304"), + ] + + +class TestEMuDump: + def test_version_parsing(self): + dump = EMuDump(MagicMock(), MagicMock(), "2020720") + assert dump.version == to_timestamp(datetime(2020, 7, 20)) + + def test_is_audit(self): + audit_dump = EMuDump(MagicMock(), EMuTable.eaudit, "2020720") + not_audit_dump = EMuDump(MagicMock(), EMuTable.ecatalogue, "2020720") + + assert audit_dump.is_audit + assert not 
not_audit_dump.is_audit + + def test_eq(self): + # eq should only care about the version and the table + assert EMuDump(MagicMock(), EMuTable.eaudit, "2020720") == EMuDump( + MagicMock(), EMuTable.eaudit, "2020720" + ) + assert not EMuDump(MagicMock(), EMuTable.ecatalogue, "2020720") == EMuDump( + MagicMock(), EMuTable.eaudit, "2020720" + ) + assert not EMuDump(MagicMock(), EMuTable.emultimedia, "2020720") == EMuDump( + MagicMock(), EMuTable.emultimedia, "2020721" + ) + + def test_lt(self): + a = EMuDump(MagicMock(), EMuTable.ecatalogue, "2020720") + b = EMuDump(MagicMock(), EMuTable.ecatalogue, "2020721") + assert a < b + + a = EMuDump(MagicMock(), EMuTable.ecatalogue, "2020720") + b = EMuDump(MagicMock(), EMuTable.emultimedia, "2020720") + assert a < b + + a = EMuDump(MagicMock(), EMuTable.emultimedia, "2020720") + b = EMuDump(MagicMock(), EMuTable.eaudit, "2020720") + assert b < a + + def test_size(self, tmp_path: Path): + path = create_dump(tmp_path, EMuTable.etaxonomy, datetime(2020, 2, 4)) + + dump = EMuDump(path, EMuTable.etaxonomy, "20200204") + + assert path.stat().st_size == dump.size + + def test_count(self, tmp_path: Path): + records = [{"x": i} for i in range(10)] + path = create_dump(tmp_path, EMuTable.etaxonomy, datetime(2020, 2, 4), *records) + dump = EMuDump(path, EMuTable.etaxonomy, "20200204") + assert dump.count == 10 + + def test_count_empty(self, tmp_path: Path): + path = create_dump(tmp_path, EMuTable.etaxonomy, datetime(2020, 2, 4)) + dump = EMuDump(path, EMuTable.etaxonomy, "20200204") + assert dump.count == 0 + + def test_iter(self, tmp_path: Path): + records = [{"x": str(i)} for i in range(10)] + path = create_dump(tmp_path, EMuTable.etaxonomy, datetime(2020, 2, 4), *records) + dump = EMuDump(path, EMuTable.etaxonomy, "20200204") + + read_records = list(dump) + + assert len(read_records) == len(records) + assert read_records == [ + EMuRecord( + i, dump.version, {"rownum": str(i), EMU_ID_FIELD: str(i), **record} + ) + for i, record in 
enumerate(records, start=1) + ] + + def test_iter_empty(self, tmp_path: Path): + path = create_dump(tmp_path, EMuTable.etaxonomy, datetime(2020, 2, 4)) + dump = EMuDump(path, EMuTable.etaxonomy, "20200204") + + assert not list(dump) + + def test_iter_missing_irn(self, tmp_path: Path): + path = tmp_path / "a_bad_dump.gz" + + with gzip.open(path, "wt", encoding="utf-8") as f: + # don't write an irn + f.writelines([f"rownum=1\n", f"x:1=beans\n", "###\n"]) + + dump = EMuDump(path, EMuTable.etaxonomy, "20200204") + + assert not list(dump) + + def test_iter_multiple_values(self, tmp_path: Path): + records = [{"x": (str(i), str(i + 1), str(i + 5))} for i in range(10)] + path = create_dump(tmp_path, EMuTable.etaxonomy, datetime(2020, 2, 4), *records) + dump = EMuDump(path, EMuTable.etaxonomy, "20200204") + + read_records = list(dump) + + assert len(read_records) == len(records) + assert read_records == [ + EMuRecord( + i, dump.version, {"rownum": str(i), EMU_ID_FIELD: str(i), **record} + ) + for i, record in enumerate(records, start=1) + ] + + def test_iter_blank_lines_and_no_delimiter_end(self, tmp_path: Path): + records = [{"x": (str(i), str(i + 1), str(i + 5))} for i in range(10)] + path = create_dump(tmp_path, EMuTable.etaxonomy, datetime(2020, 2, 4), *records) + with gzip.open(path, "at") as f: + # add a couple of new lines and don't add a ### at the end either + f.write("\n\n") + + dump = EMuDump(path, EMuTable.etaxonomy, "20200204") + read_records = list(dump) + assert len(read_records) == len(records) + + +class TestEMuAuditDump: + def test_iter(self, tmp_path: Path): + records = [ + {"AudOperation": "delete", "AudKey": str(i), "AudTable": "ecatalogue"} + for i in range(10) + ] + # add a record we should ignore + records.append( + { + "AudOperation": "not a delete!", + "AudKey": "100", + "AudTable": "ecatalogue", + } + ) + # add a delete on a table we don't deal with + records.append( + { + "AudOperation": "delete", + "AudKey": "101", + "AudTable": "not an emu 
table ever", + } + ) + + path = create_dump(tmp_path, EMuTable.eaudit, datetime(2020, 1, 4), *records) + dump = EMuAuditDump(path, EMuTable.eaudit, "20200104") + + read_records = list(dump) + assert len(read_records) == 11 + # check they are all deletes + assert all(record.data["AudOperation"] == "delete" for record in read_records) + assert all( + record.data["AudTable"] == "ecatalogue" for record in read_records[:-1] + ) + assert read_records[-1].data["AudTable"] == "not an emu table ever" diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/helpers/dumps.py b/tests/helpers/dumps.py new file mode 100644 index 0000000..3bc0bcf --- /dev/null +++ b/tests/helpers/dumps.py @@ -0,0 +1,59 @@ +import gzip +from datetime import datetime +from pathlib import Path +from typing import Union + +from dataimporter.emu.dumps import EMuTable, EMU_ID_FIELD + + +def create_dump( + root: Path, table: Union[str, EMuTable], date: datetime, *records: dict +) -> Path: + """ + Creates an EMu dump using the given parameters to form the path in the root, and + then adding the records. If no records are provided, a valid dump is still + generated, it will just include no records. 
+ + :param root: the directory to put the dump in + :param table: the EMu table being dumped, doesn't need to be valid (hence the + str|EMuTable type) + :param date: the date of the dump + :param records: 0+ records as dicts + :return: the path of the created dump + """ + export_part = "export" + if isinstance(table, EMuTable): + # eaudit dumps have a slightly different name format to normal dumps + if table == EMuTable.eaudit: + export_part = "deleted-export" + table = table.name + + # form the path + dump = root / f"{table}.{export_part}.{date.strftime('%Y%m%d')}.gz" + + with gzip.open(dump, "wt", encoding="utf-8") as f: + for row, record in enumerate(records, start=1): + # create rownum and irn values for the record if it doesn't have them + row = record.get("rownum", row) + irn = record.get(EMU_ID_FIELD, row) + f.writelines( + [ + f"rownum={row}\n", + f"{EMU_ID_FIELD}:1={irn}\n", + ] + ) + + # write the other values + for key, value in record.items(): + # we've done these, ignore if found + if key == EMU_ID_FIELD or key == "rownum": + continue + + if not isinstance(value, (tuple, list)): + value = [value] + + f.writelines([f"{key}:{i}={v}\n" for i, v in enumerate(value)]) + + f.write("###\n") + + return dump