Skip to content

Commit

Permalink
File: Allow selecting a file with an arbitrary extension
Browse files Browse the repository at this point in the history
  • Loading branch information
janezd committed Jan 19, 2025
1 parent f3d9e19 commit 5cd773b
Show file tree
Hide file tree
Showing 3 changed files with 188 additions and 34 deletions.
87 changes: 64 additions & 23 deletions Orange/widgets/data/owfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
# module's namespace so that old saved settings still work
from Orange.widgets.utils.filedialogs import RecentPath

DEFAULT_READER_TEXT = "Automatically detect type"
DEFAULT_READER_TEXT = "Determine type from the file extension"

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -147,8 +147,11 @@ class Warning(widget.OWWidget.Warning):
class Error(widget.OWWidget.Error):
file_not_found = Msg("File not found.")
missing_reader = Msg("Missing reader.")
select_file_type = Msg("Select file type.")
sheet_error = Msg("Error listing available sheets.")
unknown = Msg("Read error:\n{}")
unknown_select = Msg(
"Read error, possibly due to incorrect choice of file type:\n{}")

UserAdviceMessages = [
widget.Message(
Expand Down Expand Up @@ -264,7 +267,7 @@ def package(w):
self.reader_combo = QComboBox(self)
self.reader_combo.setSizePolicy(Policy.Expanding, Policy.Fixed)
self.reader_combo.setMinimumSize(QSize(100, 1))
self.reader_combo.activated[int].connect(self.select_reader)
self.reader_combo.activated[int].connect(self.on_reader_change)

box.layout().addWidget(self.reader_combo)
layout.addWidget(box, 0, 1)
Expand Down Expand Up @@ -327,6 +330,10 @@ def select_sheet(self):
self.recent_paths[0].sheet = self.sheet_combo.currentText()
self.load_data()

def on_reader_change(self, n):
self.select_reader(n)
self.load_data()

def select_reader(self, n):
if self.source != self.LOCAL_FILE:
return # ignore for URL's
Expand All @@ -335,14 +342,11 @@ def select_reader(self, n):
path = self.recent_paths[0]
if n == 0: # default
path.file_format = None
self.load_data()
elif n <= len(self.available_readers):
reader = self.available_readers[n - 1]
path.file_format = reader.qualified_name()
self.load_data()
else: # the rest include just qualified names
path.file_format = self.reader_combo.itemText(n)
self.load_data()

def _url_set(self):
index = self.url_combo.currentIndex()
Expand Down Expand Up @@ -373,7 +377,9 @@ def browse_file(self, in_demos=False):
else:
start_file = self.last_path() or os.path.expanduser("~/")

filename, reader, _ = open_filename_dialog(start_file, None, self.available_readers)
filename, reader, _ = open_filename_dialog(
start_file, None, self.available_readers,
add_all="*")
if not filename:
return
self.add_path(filename)
Expand Down Expand Up @@ -415,20 +421,20 @@ def _try_load(self):
if not url:
return self.Information.no_file_selected

def mark_problematic_reader():
self.reader_combo.setItemData(self.reader_combo.currentIndex(),
QBrush(Qt.red), Qt.ForegroundRole)

try:
self.reader = self._get_reader() # also sets current reader index
assert self.reader is not None
except MissingReaderException:
mark_problematic_reader()
return self.Error.missing_reader
if self.reader_combo.currentIndex() > 0:
return self.Error.missing_reader
else:
return self.Error.select_file_type
except Exception as ex:
mark_problematic_reader()
log.exception(ex)
return lambda x=ex: self.Error.unknown(str(x))
if self.reader_combo.currentIndex() > 0:
return lambda x=ex: self.Error.unknown(str(x))
else:
return lambda x=ex: self.Error.unknown_select(str(x))

try:
self._update_sheet_combo()
Expand All @@ -439,7 +445,6 @@ def mark_problematic_reader():
try:
data = self.reader.read()
except Exception as ex:
mark_problematic_reader()
log.exception(ex)
return lambda x=ex: self.Error.unknown(str(x))
if warnings:
Expand All @@ -455,9 +460,25 @@ def mark_problematic_reader():
return None

def _get_reader(self) -> FileFormat:
"""
Get the reader for the current file.
For local files, this also observes the stored settings and the reader
combo, as follows:
1. If the file format is known (from stored settings), use it and set
the reader combo to the corresponding index (as in settings)
2. Otherwise, detect it from the extension and set the combo to
Auto detect, overriding any previous user-set choice
3. Otherwise, use the current combo state.
Returns:
FileFormat: reader instance
"""
if self.source == self.LOCAL_FILE:
path = self.last_path()
self.reader_combo.setEnabled(True)

if self.recent_paths and self.recent_paths[0].file_format:
qname = self.recent_paths[0].file_format
qname_index = {r.qualified_name(): i for i, r in enumerate(self.available_readers)}
Expand All @@ -473,9 +494,20 @@ def _get_reader(self) -> FileFormat:
except Exception as ex:
raise MissingReaderException(f'Can not find reader "{qname}"') from ex
reader = reader_class(path)

else:
self.reader_combo.setCurrentIndex(0)
reader = FileFormat.get_reader(path)
old_idx = self.reader_combo.currentIndex()
try:
self.reader_combo.setCurrentIndex(0)
reader = FileFormat.get_reader(path)
except MissingReaderException:
if old_idx == 0:
raise
# Set the path for the current file format,
# and repeat the call to return the corresponding reader
self.select_reader(old_idx)
return self._get_reader()

if self.recent_paths and self.recent_paths[0].sheet:
reader.select_sheet(self.recent_paths[0].sheet)
return reader
Expand Down Expand Up @@ -504,12 +536,21 @@ def _select_active_sheet(self):
self.sheet_combo.setCurrentIndex(0)

def _initialize_reader_combo(self):
self.reader_combo.clear()
filters = [format_filter(f) for f in self.available_readers]
self.reader_combo.addItems([DEFAULT_READER_TEXT] + filters)
self.reader_combo.setCurrentIndex(0)
self.reader_combo.setDisabled(True)
# additional readers may be added in self._get_reader()
# Reset to initial state without losing the current index or
# emitting any signals.
combo = self.reader_combo
if not combo.count():
filters = [format_filter(f) for f in self.available_readers]
combo.addItems([DEFAULT_READER_TEXT] + filters)
combo.setCurrentIndex(0)
else:
# additional readers may be added in self._get_reader()
n = len(self.available_readers) + 1
if combo.currentIndex() >= n:
combo.setCurrentIndex(0)
while combo.count() > n:
combo.removeItem(combo.count() - 1)
combo.setDisabled(True)

@staticmethod
def _describe(table):
Expand Down
128 changes: 119 additions & 9 deletions Orange/widgets/data/tests/test_owfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
Domain, DiscreteVariable, ContinuousVariable
from Orange.util import OrangeDeprecationWarning

from Orange.data.io import TabReader
from Orange.data.io import TabReader, XlsReader
from Orange.tests import named_file
from Orange.widgets.data.owfile import OWFile, OWFileDropHandler, DEFAULT_READER_TEXT
from Orange.widgets.utils.filedialogs import dialog_formats, format_filter, RecentPath
Expand Down Expand Up @@ -361,13 +361,13 @@ def test_reader_custom_tab(self):
outdata = self.get_output(self.widget.Outputs.data)
self.assertEqual(len(outdata), 150) # loaded iris

def test_no_reader_extension(self):
def test_unknown_extension(self):
with named_file("", suffix=".xyz_unknown") as fn:
no_reader = RecentPath(fn, None, None)
self.widget = self.create_widget(OWFile,
stored_settings={"recent_paths": [no_reader]})
self.widget.load_data()
self.assertTrue(self.widget.Error.missing_reader.is_shown())
self.assertTrue(self.widget.Error.select_file_type.is_shown())

def test_fail_sheets(self):
with named_file("", suffix=".failed_sheet") as fn:
Expand Down Expand Up @@ -418,6 +418,22 @@ def test_no_specified_reader(self):
self.assertTrue(self.widget.Error.missing_reader.is_shown())
self.assertEqual(self.widget.reader_combo.currentText(), "not.a.file.reader.class")


def _select_reader(self, name):
reader_combo = self.widget.reader_combo
len_with_qname = len(reader_combo)
for i in range(len_with_qname):
text = reader_combo.itemText(i)
if text.startswith(name):
break
else:
assert f"No reader starts with {name!r}"
reader_combo.setCurrentIndex(i)
reader_combo.activated.emit(i)

def _select_tab_reader(self):
self._select_reader("Tab-separated")

def test_select_reader(self):
filename = FileFormat.locate("iris.tab", dataset_dirs)

Expand All @@ -436,12 +452,7 @@ def test_select_reader(self):
self.assertEqual(self.widget.reader_combo.currentText(), "not.a.file.reader.class")
self.assertEqual(self.widget.reader, None)

# select the tab reader
for i in range(len_with_qname):
text = self.widget.reader_combo.itemText(i)
if text.startswith("Tab-separated"):
break
self.widget.reader_combo.activated.emit(i)
self._select_tab_reader()
self.assertEqual(len(self.widget.reader_combo), len_with_qname - 1)
self.assertTrue(self.widget.reader_combo.currentText().startswith("Tab-separated"))
self.assertIsInstance(self.widget.reader, TabReader)
Expand All @@ -452,6 +463,105 @@ def test_select_reader(self):
self.assertEqual(self.widget.reader_combo.currentText(), DEFAULT_READER_TEXT)
self.assertIsInstance(self.widget.reader, TabReader)

def test_auto_detect_and_override(self):
tab_as_xlsx = FileFormat.locate("actually-a-tab-file.xlsx", dataset_dirs)
iris = FileFormat.locate("iris", dataset_dirs)

reader_combo = self.widget.reader_combo

reader_combo.setCurrentIndex(0)
reader_combo.activated.emit(0)
assert (self.widget.reader_combo.currentText()
== "Determine type from the file extension")

def open_file(_a, _b, _c, filters, _e):
return filename, filters.split(";;")[0]

with patch("AnyQt.QtWidgets.QFileDialog.getOpenFileName",
open_file):

# Loading a tab file with extension xlsx fails with auto-detect
filename = tab_as_xlsx
self.widget.browse_file()

self.assertEqual(self.widget.reader_combo.currentText(),
"Determine type from the file extension")
self.assertTrue(self.widget.Error.unknown_select.is_shown())
self.assertIsNone(self.get_output(self.widget.Outputs.data))

# Select the tab reader: it should work
self._select_tab_reader()
assert "Tab-separated" in self.widget.reader_combo.currentText()

self.assertFalse(self.widget.Error.unknown_select.is_shown())
self.assertIsInstance(self.widget.reader, TabReader)
self.assertIsNotNone(self.get_output(self.widget.Outputs.data))

# Switching to iris resets the combo to auto-detect
filename = iris
self.widget.browse_file()
self.assertEqual(self.widget.reader_combo.currentText(),
"Determine type from the file extension")
self.assertIsNotNone(self.get_output(self.widget.Outputs.data))

# Taking the tab-as-xlsx file from recent paths should restore
# the file type for that file
self.widget.file_combo.setCurrentIndex(1)
self.widget.file_combo.activated.emit(1)
self.assertIn("Tab-separated", self.widget.reader_combo.currentText())
self.assertIsNotNone(self.get_output(self.widget.Outputs.data))

# Reloading should work
self.widget.load_data()
self.assertIn("Tab-separated", self.widget.reader_combo.currentText())
self.assertIsNotNone(self.get_output(self.widget.Outputs.data))

# Loading this file - not from history - should fail
filename = tab_as_xlsx
self.widget.browse_file()
self.assertTrue(self.widget.Error.unknown_select.is_shown())
self.assertIsNone(self.get_output(self.widget.Outputs.data))

# Set the correct type again (preparation for the next text block)
self._select_tab_reader()
assert not self.widget.Error.unknown_select.is_shown()
assert isinstance(self.widget.reader, TabReader)
assert self.get_output(self.widget.Outputs.data) is not None

# Now load a real Excel file: this is a known excention so the combo
# should return to auto-detect
filename = FileFormat.locate("an_excel_file.xlsx", dataset_dirs)
self.widget.browse_file()
self.assertEqual(self.widget.reader_combo.currentText(),
"Determine type from the file extension")
self.assertFalse(self.widget.Error.unknown_select.is_shown())
self.assertIsNotNone(self.get_output(self.widget.Outputs.data))

# Load iris to prepare for the next test block
filename = iris
self.widget.browse_file()
assert (self.widget.reader_combo.currentText()
== "Determine type from the file extension")
assert self.get_output(self.widget.Outputs.data) is not None

# Files with unknown extensions require manual selection
filename = FileFormat.locate("an_excel_file.foo", dataset_dirs)
self.widget.browse_file()
self.assertTrue(self.widget.Error.select_file_type.is_shown())
self.assertIsNone(self.get_output(self.widget.Outputs.data))

self._select_reader("Excel")
self.assertFalse(self.widget.Error.unknown_select.is_shown())
self.assertFalse(self.widget.Error.select_file_type.is_shown())
self.assertIsNotNone(self.get_output(self.widget.Outputs.data))

# Consecutive loading of files with the same extension keeps selection
filename = FileFormat.locate("an_excel_file-too.foo", dataset_dirs)
self.widget.browse_file()
self.assertFalse(self.widget.Error.unknown_select.is_shown())
self.assertFalse(self.widget.Error.select_file_type.is_shown())
self.assertIsNotNone(self.get_output(self.widget.Outputs.data))

def test_select_reader_errors(self):
filename = FileFormat.locate("iris.tab", dataset_dirs)

Expand Down
7 changes: 5 additions & 2 deletions i18n/si/msgs.jaml
Original file line number Diff line number Diff line change
Expand Up @@ -6161,7 +6161,7 @@ widgets/data/owfeaturestatistics.py:
__main__: false
iris: false
widgets/data/owfile.py:
Automatically detect type: Samodejno zaznaj vrsto datoteke
Determine type from the file extension: Določi vrsto iz končnice datoteke
def `add_origin`:
type: false
origin: false
Expand Down Expand Up @@ -6196,8 +6196,10 @@ widgets/data/owfile.py:
class `Error`:
File not found.: Datoteka ni najdena.
Missing reader.: Bralnik za ta tip ne obstaja.
Select file type.: Izberite vrsto datoteke.
Error listing available sheets.: Napaka ob ustvarjanju seznama listov.
Read error:\n{}: Napaka ob branju:\n{}
Read error, possibly due to incorrect choice of file type:\n{}: Napaka ob branju, morda zaradi napačne izbire vrste datoteke:\n{}
'Use CSV File Import widget for advanced options ': 'Uporabi Bralnik CSV za napredne možnosti '
for comma-separated files: za datoteke ločene z vejico
use-csv-file-import: falxe
Expand Down Expand Up @@ -6232,10 +6234,11 @@ widgets/data/owfile.py:
File: Datoteka
Cannot find the directory with documentation datasets: Ne najdem mape s podatki iz dokumentacije
~/: false
*: false
def `load_data`:
No data.: Ni podatkov.
def `_get_reader`:
Can not find reader "{qname}": Bralnik "{qname}" ne obstaja.
Can not find reader "{qname}": Ne najdem bralnika "{qname}"
def `_describe`:
attributes: false
Name: Ime
Expand Down

0 comments on commit 5cd773b

Please sign in to comment.