Fs insert one new #166

Open · wants to merge 16 commits into base: client
5 changes: 4 additions & 1 deletion .gitignore
@@ -153,4 +153,7 @@ dmypy.json
# Passwords for Mongo
pydatarecognition/secret_password.yml
pydatarecognition/secret_password2.yml
-testing-cif-datarec-secret.json
+testing-cif-datarec-secret.json
+
+# pre-commit-hooks
+.pre-commit-config.yaml
23 changes: 23 additions & 0 deletions news/fs_insert_one.rst
@@ -0,0 +1,23 @@
**Added:**

* function for inserting one document to the filesystem database

**Changed:**

* <news item>

**Deprecated:**

* <news item>

**Removed:**

* <news item>

**Fixed:**

* <news item>

**Security:**

* <news item>
7 changes: 4 additions & 3 deletions pydatarecognition/cif_io.py
@@ -239,6 +239,7 @@ def powdercif_to_json(po):

    return json_object


def json_dump(json_object, output_path):
    with open(output_path, 'w') as f:
        json.dump(json_object, f)
@@ -342,6 +343,7 @@ def terminal_print(rank_doi_score_txt):
    print('-' * 81)
    return None


def print_story(user_input, args, ciffiles, skipped_cifs):
    frame_dashchars = '-'*80
    print(f'{frame_dashchars}\nInput data file: {user_input.name}\n'
@@ -354,7 +356,6 @@ def print_story(user_input, args, ciffiles, skipped_cifs):
            print(f" {cif[0]} because {cif[1]}")
    print(f'Done working with cifs.\n{frame_dashchars}\nGetting references...')


if __name__=="__main__":
    import pathlib
    toubling_path = pathlib.Path(os.path.join(os.pardir, 'docs/examples/cifs/measured/ps5069IIIsup4.rtv.combined.cif'))
    json_dump(powdercif_to_json(cif_read(toubling_path)), pathlib.Path('../test1.json'))
    pass
3 changes: 2 additions & 1 deletion pydatarecognition/database.py
@@ -87,7 +87,8 @@ def open_dbs(rc, dbs=None):
            if k in chained_db[base]:
                chained_db[base][k].maps.append(v)
            else:
-                chained_db[base][k] = ChainDB(v)
+                # chained_db[base][k] = ChainDB(v)
+                pass
    client.chained_db = chained_db
    return client

3 changes: 0 additions & 3 deletions pydatarecognition/fsclient.py
@@ -166,9 +166,6 @@ def load_yaml(self, db, dbpath):
        for f in [
            file
            for file in iglob(os.path.join(dbpath, "*.y*ml"))
-            if file not in db["blacklist"]
-            and len(db["whitelist"]) == 0
-            or os.path.basename(file).split(".")[0] in db["whitelist"]
        ]:
            collfilename = os.path.split(f)[-1]
            base, ext = os.path.splitext(collfilename)
5 changes: 3 additions & 2 deletions pydatarecognition/runcontrol.py
@@ -306,5 +306,6 @@ def connect_db(rc, colls=None):
    '''
    with connect(rc, dbs=colls) as rc.client:
        dbs = rc.client.dbs
-        chained_db = rc.client.chained_db
-        return chained_db, dbs
+        # chained_db = rc.client.chained_db
+        # return chained_db, dbs
+        return dbs
12 changes: 12 additions & 0 deletions tests/conftest.py
@@ -10,6 +10,8 @@
from xonsh.lib.os import rmtree
from pydatarecognition.powdercif import storage, BUCKET_NAME
from pydatarecognition.fsclient import dump_yaml
from pydatarecognition.runcontrol import DEFAULT_RC
from tests.inputs.pydr_rc import pydr_rc
from tests.inputs.exemplars import EXEMPLARS
from google.cloud.exceptions import Conflict
from copy import deepcopy
@@ -22,6 +24,16 @@
CIFJSON_COLLECTION_NAME = "cif_json"


@pytest.fixture(scope="session")
def rc(make_db):
rc = DEFAULT_RC
db_path = make_db
pydr_rc['databases'][0]['url'] = db_path
rc._update(pydr_rc)

return rc


@pytest.fixture(scope="function")
def cif_mongodb_client_populated():
yield from cif_mongodb_client(True)
12 changes: 12 additions & 0 deletions tests/inputs/pydr_rc.py
@@ -0,0 +1,12 @@
pydr_rc = {
    "groupname": "Billinge Group",
    "databases": [
        {
            "name": "local",
            "url": ".",
            "public": False,
            "path": "db",
            "local": True
        }
    ]
}
78 changes: 54 additions & 24 deletions tests/test_fsclient.py
@@ -1,8 +1,14 @@
from collections import defaultdict
from pathlib import Path
from testfixtures import TempDirectory

import pytest
import os

from pydatarecognition.fsclient import FileSystemClient
from pydatarecognition.runcontrol import connect_db
from tests.inputs.pydr_rc import pydr_rc
from tests.inputs.exemplars import EXEMPLARS

#
# def test_dump_json():
@@ -18,48 +24,40 @@
# actual = f.read()
# assert actual == json_doc

-# todo:
-# build a runcontrol object as in regolith. have it created globally in the
-# tests for reuse in all the tests (look for DEFAULT_RC in regolith tests)
-# for now:
-# DEFAULT_RC = RunControl(
-#     _validators=DEFAULT_VALIDATORS,
-#     builddir="_build",
-#     mongodbpath=property(lambda self: os.path.join(self.builddir, "_dbpath")),
-#     user_config=os.path.expanduser("~/.config/regolith/user.json"),
-#     force=False,
-#     database=None
-# )
-DEFAULT_RC = {}
-rc = DEFAULT_RC


# FileSystemClient methods tested here
-def test_is_alive():
+def test_is_alive(rc):
    expected = True  # filesystem is always alive!
    fsc = FileSystemClient(rc)
    actual = fsc.is_alive()

    assert actual == expected


-def test_open():
+def test_open(rc):
    fsc = FileSystemClient(rc)
    fsc.open()

    # assert fsc.dbs == rc.databases
    actual = fsc.dbs
    expected = connect_db(rc)[1]
    assert actual == expected

    assert isinstance(fsc.dbs, type(defaultdict(lambda: defaultdict(dict))))
    assert isinstance(fsc.chained_db, type(dict()))
    assert not fsc.closed


-def test_close():
+def test_close(rc):
    fsc = FileSystemClient(rc)
    assert fsc.open
    # assert fsc.dbs == rc.databases

    actual = fsc.dbs
    expected = connect_db(rc)[1]
    assert actual == expected

    assert isinstance(fsc.dbs, type(defaultdict(lambda: defaultdict(dict))))

-    actual = fsc.close()
+    fsc.close()
    assert fsc.dbs is None
    assert fsc.closed

@@ -119,10 +117,42 @@ def test_all_documents():
    pass


-@pytest.mark.skip("Not written")
-def test_insert_one():
-    pass
test_insert_cif = [({'intensity': [], 'q': [], 'ttheta': [], 'wavelength': 0.111111, '_id': 'ts1129'},
                    {'intensity': [], 'q': [], 'ttheta': [], 'wavelength': 0.111111, '_id': 'ts1129'})]
@pytest.mark.parametrize('input, result', test_insert_cif)
def test_insert_one(rc, input, result):
    client = FileSystemClient(rc)
    client.open()

Collaborator:
The code below seems to be testing connect_db, which should be tested in test_connect_db and not here. It is ok to use the function here to connect your db, but don't write tests for it here.

    dbname = 'local'
    collname = 'calculated'

    client.load_database(pydr_rc['databases'][0])

    client.insert_one(dbname, collname, input)
Collaborator:
Please check we have the right logic here. I think it should look like:

  1. take the exemplar and create a filesystem database with it somewhere in a tempdir
  2. connect to this db
  3. insert_one into this db
  4. read the file back from the tempdir
  5. make sure it contains the original exemplar and additionally the thing that was added.

I don't see the read, so I am not sure we are testing that the file was updated.
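One way the proposed flow could look as a concrete test. This is a sketch under two assumptions the diff does not confirm: that the rc fixture points the 'local' database at the tempdir, and that a collection is stored as a single YAML file keyed by _id:

import pathlib

import yaml
from testfixtures import TempDirectory

from pydatarecognition.fsclient import FileSystemClient


def test_insert_one_updates_file(rc):
    exemplar = {'_id': 'ts0000', 'wavelength': 0.15}     # pre-existing document
    new_doc = {'_id': 'ts1129', 'wavelength': 0.111111}  # document to insert
    with TempDirectory() as d:
        # 1. seed a filesystem database: one collection file holding the exemplar
        #    (assumes rc resolves the 'local' database to d.path)
        collfile = pathlib.Path(d.path) / 'calculated.yml'
        collfile.write_text(yaml.safe_dump({exemplar['_id']: exemplar}))
        # 2./3. connect to this db and insert_one into it
        client = FileSystemClient(rc)
        client.open()
        client.insert_one('local', 'calculated', new_doc)
        # 4. read the file back from disk, bypassing the client's in-memory state
        on_disk = yaml.safe_load(collfile.read_text())
        # 5. the original exemplar survives and the new document was added
        assert on_disk[exemplar['_id']] == exemplar
        assert on_disk[new_doc['_id']] == new_doc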

Author:
I think since we are calling the rc fixture when running the test, the make_db and rc functions in conftest.py automatically generate a filesystem database with the exemplar in a tempdir. So if we print(client.dbs) after client.insert_one, the database contains {'local': {'calculated': {...EXEMPLAR, INSERTED_DATA}, 'measured': {...EXEMPLAR}}}. Also, this is what I originally intended to do when I wrote assert list(client.dbs[dbname][collname].values())[-1] == result.


    assert list(client.dbs[dbname][collname].values())[-1] == result
Collaborator:
Per your question (which I don't see here for some reason), this may be left over from Regolith, where the process is to load the entire database into memory (as client.dbs[dbname][collname]) and put it back into the database at the end with a db.dump() or something like that. But I think we don't want to do that here, as we will have 100,000 entries in the db, so we want insert_one to insert all the way into the database backend. In the filesystem case this will be a text file on disc containing yml or json; in the mongo case it will be a remote mongodb; and for mpcontribs it will be in mpcontribs.

Author:
Sorry, quick question here. So it's bad practice to load client.dbs[dbname][collname] entirely as above. What if we implement find_one first and use it to test insert_one? For example: assert client.find_one(inserted_cif_id) == inserted_cif?

Collaborator:
I think we could, but I am not convinced that it is the best way. We should presumably also test that nothing else in the file was corrupted or overwritten.

Also, as I mentioned before, we are not inserting cifs per se; I think insert_one should be able to insert any valid json. I think you agree, but putting cif in the name makes the test harder to read, so I suggest changing cif to json in the name.

Author:
Thank you! Another quick question: by inserting all the way into the database backend, do you mean writing the yaml file directly instead of using client.load_db and client.dump_db? That means we have to change our insert_one function to directly access the yaml file?

Collaborator:
Yes, I think that is right. Regolith assumed small databases that could be stored in memory, so operations on the items could be done with loops and list comprehensions, but with large databases we want to use the database's capabilities directly, so we want proper CRUD operations. We probably want the fs backend to be json format, not yml, but otherwise, yes, I think we want to insert directly.
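A minimal sketch of what such a direct-to-disk insert_one could look like for a JSON filesystem backend. The class name, file layout (one JSON file per collection, keyed by _id), and error messages are all assumptions for illustration, not the project's confirmed API:

import json
import os


class FileSystemClientSketch:
    """Illustrative only: each collection lives in <dbpath>/<dbname>/<collname>.json."""

    def __init__(self, dbpath):
        self.dbpath = dbpath

    def _collection_file(self, dbname, collname):
        return os.path.join(self.dbpath, dbname, f"{collname}.json")

    def insert_one(self, dbname, collname, doc):
        if not isinstance(doc, dict):
            raise TypeError(f"document must be a dict, got {type(doc).__name__}")
        if "_id" not in doc:
            raise KeyError("document is missing required key '_id'")
        fname = self._collection_file(dbname, collname)
        # read-modify-write only the one collection file; never load the whole db
        with open(fname) as f:
            coll = json.load(f)
        if doc["_id"] in coll:
            raise ValueError(f"duplicate _id {doc['_id']!r} in {collname}")
        coll[doc["_id"]] = doc
        with open(fname, "w") as f:
            json.dump(coll, f, indent=2)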



test_insert_cif_bad = [{'bad_case_test_dict': 'bad'}, 'bad_case_test_str']
def test_insert_one_bad(rc):
    client = FileSystemClient(rc)
    client.open()

    dbname = 'local'
    collname = 'calculated'

    client.load_database(pydr_rc['databases'][0])

    with pytest.raises(KeyError) as excinfo:
        client.insert_one(dbname, collname, test_insert_cif_bad[0])
    assert '_id' in str(excinfo.value)
    print(excinfo)
Author:
Also, how do we return info back to the user in pytest? Is print() enough? I couldn't find it in the pytest docs...

Collaborator:
We don't want to return anything back to the user in the tests; we just want them to pass or fail. This pattern could work:

def test_insert_one_bad():
    with pytest.raises(TypeError, match=r"Bad value in database entry key bad_entry_key"):
        insert_one(input[0])

Collaborator:
Then we need logic in the function itself so it raises an exception with the right error message.
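For instance, a hedged sketch of validation logic whose messages the match pattern above could target; the exact wording and placement are illustrative, not the agreed design:

def validate_document(doc):
    # checks insert_one could run before touching the backend; the messages
    # are written so pytest.raises(..., match=...) can assert on them
    if not isinstance(doc, dict):
        raise TypeError(f"Bad value in database entry: expected dict, got {type(doc).__name__}")
    if "_id" not in doc:
        raise KeyError("Bad value in database entry: missing required key '_id'")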


    with pytest.raises(TypeError) as excinfo:
        client.insert_one(dbname, collname, test_insert_cif_bad[1])
    assert 'dict' not in str(excinfo.value)
    print(excinfo)

@pytest.mark.skip("Not written")
def test_insert_many():
40 changes: 9 additions & 31 deletions tests/test_runcontrol.py
@@ -4,37 +4,15 @@
from testfixtures import TempDirectory
from pathlib import Path

-from pydatarecognition.runcontrol import DEFAULT_RC, load_rcfile, filter_databases, \
-    connect_db
+from pydatarecognition.runcontrol import filter_databases, connect_db
from pydatarecognition.database import connect


-pydr_rc = b"""
-{
-    "groupname": "Billinge Group",
-    "databases": [
-        {
-            "name": "calculated",
-            "url": ".",
-            "public": false,
-            "path": "db",
-            "local": true
-        }
-    ]
-}
-"""
-def test_connect_db():
-    rc = copy.copy(DEFAULT_RC)
-
-    with TempDirectory() as d:
-        temp_dir = Path(d.path)
-        d.write(f"pydr_rc.json", pydr_rc)
-        rc._update(load_rcfile(temp_dir / "pydr_rc.json"))
-        filter_databases(rc)
-        with connect(rc) as rc.client:
-            expected_dbs = rc.client.dbs
-            expected_chdb = rc.client.chained_db
-        chained_db, dbs = connect_db(rc)
-        assert chained_db == expected_chdb
-        assert dbs == expected_dbs
+def test_connect_db(rc):
+    filter_databases(rc)
+    with connect(rc) as rc.client:
+        expected_dbs = rc.client.dbs
+        expected_chdb = rc.client.chained_db
+    dbs = connect_db(rc)
+    # assert chained_db == expected_chdb
+    assert dbs == expected_dbs