Skip to content

Commit

Permalink
Add order_by for paginated queries (data-dot-all#1249)
Browse files Browse the repository at this point in the history
### Feature or Bugfix
<!-- please choose -->
- Bugfix

### Detail
- This PR aims to solve the following

- (1) for particular queries (identified as ones that perform
`.outerjoin()` operations and have results paginated with `paginate()`
function - sometimes the returned query results is *less than* the limit
set by the pageSize of the paginate function even when the total count
is greater than the pageSize
- Ex 1: 11 envs total, `query_user_environments()` returning 9 envs on
1st page + 2 on 2nd page
- Ex 2: 10 envs total, `query_user_environments()` returning 9 envs on
1st page + no 2nd page

- Believe this is to be happening due to the way SQLAlchemy is
"uniquing" the records resulted from an outerjoin and then returning
that result back to the frontend

- Adding a `.distinct()` check on the query ensures each distinct record
is returned (tested successfully)

- (2) Currently we often times do not implement an `.order_by()`
condition for the query used in `paginate()` and do not have a stable
way of preserving order of the items returned from a query (i.e. when
navigating through pages of response)
- A generally good practice seems to include an `order_by()` on a column
or set of columns
- For each query used in `paginate()` this PR adds an `order_by()`
condition (full list in comments below)

Can read a bit more context from related issue linked below

### Relates
- data-dot-all#1241

### Security
Please answer the questions below briefly where applicable, or write
`N/A`. Based on
[OWASP 10](https://owasp.org/Top10/en/).

- Does this PR introduce or modify any input fields or queries - this
includes
fetching data from storage outside the application (e.g. a database, an
S3 bucket)?
  - Is the input sanitized?
- What precautions are you taking before deserializing the data you
consume?
  - Is injection prevented by parametrizing queries?
  - Have you ensured no `eval` or similar functions are used?
- Does this PR introduce any functionality or component that requires
authorization?
- How have you ensured it respects the existing AuthN/AuthZ mechanisms?
  - Are you logging failed auth attempts?
- Are you using or adding any cryptographic features?
  - Do you use a standard proven implementations?
  - Are the used keys controlled by the customer? Where are they stored?
- Are you introducing any new policies/roles/users?
  - Have you used the least-privilege principle? How?


By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license.
  • Loading branch information
noah-paige authored May 6, 2024
1 parent 98e67fa commit ed7cc3e
Show file tree
Hide file tree
Showing 15 changed files with 61 additions and 39 deletions.
16 changes: 8 additions & 8 deletions backend/dataall/core/environment/db/environment_repositories.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def query_all_environment_consumption_roles(session, uri, filter) -> Query:
ConsumptionRole.groupUri == group,
)
)
return query.order_by(ConsumptionRole.consumptionRoleUri)
return query.order_by(ConsumptionRole.consumptionRoleName)

@staticmethod
def query_user_environment_consumption_roles(session, groups, uri, filter) -> Query:
Expand All @@ -166,7 +166,7 @@ def query_user_environment_consumption_roles(session, groups, uri, filter) -> Qu
ConsumptionRole.groupUri == group,
)
)
return query.order_by(ConsumptionRole.consumptionRoleUri)
return query.order_by(ConsumptionRole.consumptionRoleName)

@staticmethod
def query_environment_invited_groups(session, uri, filter) -> Query:
Expand All @@ -190,7 +190,7 @@ def query_environment_invited_groups(session, uri, filter) -> Query:
EnvironmentGroup.groupUri.ilike('%' + term + '%'),
)
)
return query
return query.order_by(EnvironmentGroup.groupUri)

@staticmethod
def query_user_environment_groups(session, groups, uri, filter) -> Query:
Expand All @@ -206,7 +206,7 @@ def query_user_environment_groups(session, groups, uri, filter) -> Query:
EnvironmentGroup.groupUri.ilike('%' + term + '%'),
)
)
return query
return query.order_by(EnvironmentGroup.groupUri)

@staticmethod
def query_all_environment_groups(session, uri, filter) -> Query:
Expand All @@ -218,7 +218,7 @@ def query_all_environment_groups(session, uri, filter) -> Query:
EnvironmentGroup.groupUri.ilike('%' + term + '%'),
)
)
return query
return query.order_by(EnvironmentGroup.groupUri)

@staticmethod
def query_user_consumption_roles(session, username, groups, filter) -> Query:
Expand All @@ -242,7 +242,7 @@ def query_user_consumption_roles(session, username, groups, filter) -> Query:
ConsumptionRole.groupUri == group,
)
)
return query
return query.order_by(ConsumptionRole.consumptionRoleName)

@staticmethod
def query_user_groups(session, username, groups, filter) -> Query:
Expand All @@ -258,7 +258,7 @@ def query_user_groups(session, username, groups, filter) -> Query:
EnvironmentGroup.groupUri.ilike('%' + term + '%'),
)
)
return query
return query.order_by(EnvironmentGroup.groupUri)

@staticmethod
def query_user_environments(session, username, groups, filter) -> Query:
Expand Down Expand Up @@ -287,7 +287,7 @@ def query_user_environments(session, username, groups, filter) -> Query:
)
if filter and filter.get('SamlGroupName') and filter.get('SamlGroupName') in groups:
query = query.filter(EnvironmentGroup.groupUri == filter.get('SamlGroupName'))
return query
return query.order_by(Environment.label).distinct()

@staticmethod
def is_user_invited_to_environment(session, groups, uri):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def query_user_organizations(session, username, groups, filter) -> Query:
models.Organization.tags.contains(f"{{{filter.get('term')}}}"),
)
)
return query
return query.order_by(models.Organization.label).distinct()

@staticmethod
def paginated_user_organizations(session, data=None) -> dict:
Expand All @@ -69,7 +69,7 @@ def query_organization_environments(session, uri, filter) -> Query:
Environment.description.ilike('%' + filter.get('term') + '%'),
)
)
return query
return query.order_by(Environment.label)

@staticmethod
def paginated_organization_environments(session, uri, data=None) -> dict:
Expand Down Expand Up @@ -104,7 +104,7 @@ def query_organization_groups(session, uri, filter) -> Query:
models.OrganizationGroup.groupUri.ilike('%' + filter.get('term') + '%'),
)
)
return query
return query.order_by(models.OrganizationGroup.groupUri)

@staticmethod
def paginated_organization_groups(session, uri, data=None) -> dict:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def list_tenant_groups(session, data=None):
query = query.filter(TenantPolicy.principalId.ilike('%' + data.get('term') + '%'))

return paginate(
query=query,
query=query.order_by(TenantPolicy.principalId),
page=data.get('page', 1),
page_size=data.get('pageSize', 10),
).to_dict()
2 changes: 1 addition & 1 deletion backend/dataall/core/vpc/db/vpc_repositories.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,4 @@ def query_environment_networks(session, uri, filter):
Vpc.VpcId.ilike('%' + term + '%'),
)
)
return query
return query.order_by(Vpc.label)
14 changes: 10 additions & 4 deletions backend/dataall/modules/catalog/db/glossary_repositories.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,9 @@ def list_glossaries(session, data=None):
GlossaryNode.readme.ilike('%' + term + '%'),
)
)
return paginate(q, page_size=data.get('pageSize', 10), page=data.get('page', 1)).to_dict()
return paginate(
q.order_by(GlossaryNode.label), page_size=data.get('pageSize', 10), page=data.get('page', 1)
).to_dict()

@staticmethod
def list_node_children(session, path, filter):
Expand Down Expand Up @@ -259,7 +261,9 @@ def list_categories(session, uri, data=None):
GlossaryNode.readme.ilike(term),
)
)
return paginate(q, page=data.get('page', 1), page_size=data.get('pageSize', 10)).to_dict()
return paginate(
q.order_by(GlossaryNode.label), page=data.get('page', 1), page_size=data.get('pageSize', 10)
).to_dict()

@staticmethod
def list_terms(session, uri, data=None):
Expand All @@ -278,7 +282,9 @@ def list_terms(session, uri, data=None):
GlossaryNode.readme.ilike(term),
)
)
return paginate(q, page=data.get('page', 1), page_size=data.get('pageSize', 10)).to_dict()
return paginate(
q.order_by(GlossaryNode.label), page=data.get('page', 1), page_size=data.get('pageSize', 10)
).to_dict()

@staticmethod
def get_node(session, uri) -> GlossaryNode:
Expand Down Expand Up @@ -399,7 +405,7 @@ def get_glossary_terms_links(session, target_uri, target_type):
TermLink.targetType == target_type,
)
)
)
).order_by(GlossaryNode.path)

return paginate(terms, page_size=10000, page=1).to_dict()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def _query_user_dashboards(session, username, groups, filter) -> Query:
Dashboard.label.ilike(filter.get('term') + '%%'),
)
)
return query
return query.order_by(Dashboard.label).distinct()

@staticmethod
def paginated_user_dashboards(session, username, groups, data=None) -> dict:
Expand Down Expand Up @@ -110,7 +110,7 @@ def _query_dashboard_shares(session, username, groups, uri, filter) -> Query:
Dashboard.label.ilike(filter.get('term') + '%%'),
)
)
return query
return query.order_by(DashboardShare.shareUri)

@staticmethod
def query_all_user_groups_shareddashboard(session, groups, uri) -> [str]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def query_user_pipelines(session, username, groups, filter) -> Query:
if filter and filter.get('type'):
if len(filter.get('type')) > 0:
query = query.filter(DataPipeline.devStrategy.in_(filter.get('type')))
return query
return query.order_by(DataPipeline.label)

@staticmethod
def paginated_user_pipelines(session, username, groups, data=None) -> dict:
Expand All @@ -150,7 +150,7 @@ def query_pipeline_environments(session, uri) -> Query:
query = session.query(DataPipelineEnvironment).filter(
DataPipelineEnvironment.pipelineUri.ilike(uri + '%%'),
)
return query
return query.order_by(DataPipelineEnvironment.stage)

@staticmethod
def paginated_pipeline_environments(session, uri, data=None) -> dict:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,9 @@ def list_shareable_items(session, share, states, data):
else query.filter(shareable_objects.c.healthStatus != ShareItemHealthStatus.Healthy.value)
)

return paginate(query, data.get('page', 1), data.get('pageSize', 10)).to_dict()
return paginate(
query.order_by(shareable_objects.c.itemName).distinct(), data.get('page', 1), data.get('pageSize', 10)
).to_dict()

@staticmethod
def list_user_received_share_requests(session, username, groups, data=None):
Expand Down Expand Up @@ -629,7 +631,7 @@ def list_user_received_share_requests(session, username, groups, data=None):
if data and data.get('share_iam_roles'):
if len(data.get('share_iam_roles')) > 0:
query = query.filter(ShareObject.principalIAMRoleName.in_(data.get('share_iam_roles')))
return paginate(query, data.get('page', 1), data.get('pageSize', 10)).to_dict()
return paginate(query.order_by(ShareObject.shareUri), data.get('page', 1), data.get('pageSize', 10)).to_dict()

@staticmethod
def list_user_sent_share_requests(session, username, groups, data=None):
Expand Down Expand Up @@ -668,7 +670,7 @@ def list_user_sent_share_requests(session, username, groups, data=None):
if data and data.get('share_iam_roles'):
if len(data.get('share_iam_roles')) > 0:
query = query.filter(ShareObject.principalIAMRoleName.in_(data.get('share_iam_roles')))
return paginate(query, data.get('page', 1), data.get('pageSize', 10)).to_dict()
return paginate(query.order_by(ShareObject.shareUri), data.get('page', 1), data.get('pageSize', 10)).to_dict()

@staticmethod
def get_share_by_dataset_and_environment(session, dataset_uri, environment_uri):
Expand Down Expand Up @@ -1001,11 +1003,15 @@ def find_dataset_shares(session, dataset_uri):

@staticmethod
def query_dataset_shares(session, dataset_uri) -> Query:
return session.query(ShareObject).filter(
and_(
ShareObject.datasetUri == dataset_uri,
ShareObject.deleted.is_(None),
return (
session.query(ShareObject)
.filter(
and_(
ShareObject.datasetUri == dataset_uri,
ShareObject.deleted.is_(None),
)
)
.order_by(ShareObject.shareUri)
)

@staticmethod
Expand Down Expand Up @@ -1157,7 +1163,9 @@ def paginate_shared_datasets(session, env_uri, data):

if data.get('uniqueShares', False):
q = q.filter(ShareObject.principalType != PrincipalType.ConsumptionRole.value)
q = q.distinct(ShareObject.shareUri)
q = q.order_by(ShareObject.shareUri).distinct(ShareObject.shareUri)
else:
q = q.order_by(ShareObjectItem.itemName).distinct()

if data.get('term'):
term = data.get('term')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def resolve_terms(context, source: DatasetTableColumn, **kwargs):
return None
with context.engine.scoped_session() as session:
q = session.query(TermLink).filter(TermLink.targetUri == source.columnUri)
return paginate(q, page=1, page_size=15).to_dict()
return paginate(q.order_by(TermLink.linkUri), page=1, page_size=15).to_dict()


def update_table_column(context: Context, source, columnUri: str = None, input: dict = None):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,4 +113,8 @@ def paginated_dataset_locations(session, uri, data=None) -> dict:
]
)
)
return paginate(query=query, page_size=data.get('pageSize', 10), page=data.get('page', 1)).to_dict()
return paginate(
query=query.order_by(DatasetStorageLocation.label),
page_size=data.get('pageSize', 10),
page=data.get('page', 1),
).to_dict()
8 changes: 4 additions & 4 deletions backend/dataall/modules/datasets/db/dataset_repositories.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def _query_all_user_datasets(session, username, groups, all_subqueries, filter)
Dataset.label.ilike(filter.get('term') + '%%'),
)
)
return query.distinct(Dataset.datasetUri)
return query.order_by(Dataset.label).distinct(Dataset.datasetUri, Dataset.label)

@staticmethod
def paginated_dataset_tables(session, uri, data=None) -> dict:
Expand Down Expand Up @@ -279,7 +279,7 @@ def query_environment_group_datasets(session, env_uri, group_uri, filter) -> Que
Dataset.region.ilike('%' + term + '%'),
)
)
return query
return query.order_by(Dataset.label)

@staticmethod
def query_environment_datasets(session, uri, filter) -> Query:
Expand All @@ -299,7 +299,7 @@ def query_environment_datasets(session, uri, filter) -> Query:
Dataset.region.ilike('%' + term + '%'),
)
)
return query
return query.order_by(Dataset.label)

@staticmethod
def query_environment_imported_datasets(session, uri, filter) -> Query:
Expand Down Expand Up @@ -375,7 +375,7 @@ def _query_user_datasets(session, username, groups, filter) -> Query:
Dataset.label.ilike(filter.get('term') + '%%'),
)
)
return query.distinct(Dataset.datasetUri)
return query.order_by(Dataset.label).distinct(Dataset.datasetUri, Dataset.label)

@staticmethod
def _set_import_data(dataset, data):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _query_user_sagemaker_studio_users(session, username, groups, filter) -> Que
SagemakerStudioUser.label.ilike(filter.get('term') + '%%'),
)
)
return query
return query.order_by(SagemakerStudioUser.label)

@staticmethod
def paginated_sagemaker_studio_users(session, username, groups, filter={}) -> dict:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def _query_user_notebooks(self, username, groups, filter) -> Query:
SagemakerNotebook.label.ilike(filter.get('term') + '%%'),
)
)
return query
return query.order_by(SagemakerNotebook.label)

def count_resources(self, environment_uri, group_uri):
return (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,11 @@ def paginated_notifications(session, username, groups, filter=None):
)
if filter.get('archived'):
q = q.filter(models.Notification.deleted.isnot(None))
return paginate(q, page=filter.get('page', 1), page_size=filter.get('pageSize', 20)).to_dict()
return paginate(
q.order_by(models.Notification.created.desc()),
page=filter.get('page', 1),
page_size=filter.get('pageSize', 20),
).to_dict()

@staticmethod
def count_unread_notifications(session, username, groups):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def query_user_worksheets(session, username, groups, filter) -> Query:
Worksheet.tags.contains(f"{{{filter.get('term')}}}"),
)
)
return query
return query.order_by(Worksheet.label)

@staticmethod
def paginated_user_worksheets(session, username, groups, uri, data=None, check_perm=None) -> dict:
Expand Down

0 comments on commit ed7cc3e

Please sign in to comment.