Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add kgtk import-csv command #702

Open
wants to merge 7 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 206 additions & 0 deletions docs/transform/import-csv.md

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion docs/transform/normalize_nodes.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ usage: kgtk normalize-nodes [-h] [-i INPUT_FILE] [-o OUTPUT_FILE]
[--id-column ID_COLUMN_NAME]
[-v [optional True|False]]

Normalize a KGTK node file into a KGTK edge file with a row for each column value in the input file.
If called as "kgtk normalize-nodes", normalize a KGTK node file into a KGTK edge file with a row for each column value in the input file.
If called as "kgtk import-csv", the input file is assumed to be a CSV file.

optional arguments:
-h, --help show this help message and exit
Expand Down
389 changes: 389 additions & 0 deletions examples/commands/import-csv.ipynb

Large diffs are not rendered by default.

30 changes: 28 additions & 2 deletions kgtk/cli/normalize_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,19 @@
from kgtk.kgtkformat import KgtkFormat
from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions, KgtkReaderMode
from kgtk.io.kgtkwriter import KgtkWriter
from kgtk.reshape.kgtkidbuilder import KgtkIdBuilder, KgtkIdBuilderOptions
from kgtk.utils.argparsehelpers import optional_bool
from kgtk.value.kgtkvalue import KgtkValue
from kgtk.value.kgtkvalueoptions import KgtkValueOptions

IMPORT_CSV_COMMAND = 'import-csv'

def parser():
return {
'aliases': [ IMPORT_CSV_COMMAND ],
'help': 'Normalize a KGTK node file into a KGTK edge file.',
'description': 'Normalize a KGTK node file into a KGTK edge file with a row for each column value in the input file.'
'description': 'If called as "kgtk normalize-nodes", Normalize a KGTK node file into a KGTK edge file with a row for each column value in the input file.\n' +
'If called as "kgtk import-csv", the input file is assumed to be a CSV file.'
}


Expand All @@ -30,6 +35,7 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names

_expert: bool = parsed_shared_args._expert
_mode: str = parsed_shared_args._mode
_command: str = parsed_shared_args._command

parser.add_input_file(positional=True)
parser.add_output_file()
Expand All @@ -50,12 +56,27 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names
expert=_expert)
KgtkValueOptions.add_arguments(parser, expert=_expert)

if _command == IMPORT_CSV_COMMAND:
parser.prog = IMPORT_CSV_COMMAND
parser.set_defaults(
input_format=KgtkReaderOptions.INPUT_FORMAT_CSV,
mode=KgtkReaderMode.NONE,
column_separator=KgtkFormat.CSV_COLUMN_SEPARATOR,
)

parser.add_argument("--add-id", action="store_true", dest="add_id",
help="Add an id column to the output. (default=%(default)s)", default=False)

KgtkIdBuilderOptions.add_arguments(parser, expert=True) # Show all the options.


def run(input_file: KGTKFiles,
output_file: KGTKFiles,

columns: typing.Optional[typing.List[str]] = None,
labels: typing.Optional[typing.List[str]] = None,
id_column_name: typing.Optional[str] = None,
add_id: typing.Optional[bool] = None,

errors_to_stdout: bool = False,
errors_to_stderr: bool = True,
Expand All @@ -79,6 +100,7 @@ def run(input_file: KGTKFiles,
# Build the option structures.
reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)
idbuilder_options: KgtkIdBuilderOptions = KgtkIdBuilderOptions.from_dict(kwargs)

# Show the final option structures for debugging and documentation.
if show_options:
Expand Down Expand Up @@ -120,8 +142,9 @@ def run(input_file: KGTKFiles,
verbose=verbose,
very_verbose=very_verbose,
)
idb: KgtkIdBuilder = KgtkIdBuilder.new(kr, idbuilder_options)

id_column_idx: int = kr.get_id_column_index(id_column_name)
id_column_idx: int = idb.new_id_column_idx if add_id else kr.get_id_column_index(id_column_name)
if id_column_idx < 0:
raise KGTKException("Unknown ID column %s" % repr(id_column_name))

Expand All @@ -141,6 +164,9 @@ def run(input_file: KGTKFiles,
for row in kr:
input_line_count += 1

if add_id:
row = idb.build(row, input_line_count)

node1_value: str = row[id_column_idx]

column_idx: int
Expand Down
14 changes: 10 additions & 4 deletions kgtk/io/kgtkreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1195,7 +1195,7 @@ def _build_column_names(cls,

# Split the first line into column names.
if input_format == KgtkReaderOptions.INPUT_FORMAT_CSV:
column_names = cls.csvsplit(header)
column_names = cls.csvsplit(header, options.column_separator)
if options.unquote_csv_column_names:
# TODO: Handle the troublesome case of a double quote inside a column
# name.
Expand Down Expand Up @@ -1271,12 +1271,18 @@ def reject(self, line):
print("%s" % line, file=self.reject_file)

@classmethod
def csvsplit(cls, line: str)->typing.List[str]:
def csvsplit(cls, line: str, column_separator: str)->typing.List[str]:
row: typing.List[str] = [ ]
item: str = ""
c: str
instring: bool = False
sawstring:bool = False
column_separator: str = (
column_separator
if column_separator != KgtkFormat.COLUMN_SEPARATOR
else KgtkFormat.CSV_COLUMN_SEPARATOR
)

for c in line:
if instring:
if c == '"':
Expand All @@ -1290,7 +1296,7 @@ def csvsplit(cls, line: str)->typing.List[str]:
instring = True
if sawstring:
item += c
elif c == ",":
elif c == column_separator:
if sawstring:
row.append(KgtkFormat.stringify(item))
else:
Expand Down Expand Up @@ -1389,7 +1395,7 @@ def nextrow(self)-> typing.List[str]:
continue

if self.input_format == KgtkReaderOptions.INPUT_FORMAT_CSV:
row = self.csvsplit(line)
row = self.csvsplit(line, self.options.column_separator)
else:
row = line.split(self.options.column_separator)

Expand Down
1 change: 1 addition & 0 deletions kgtk/kgtkformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

class KgtkFormat:
COLUMN_SEPARATOR: str = "\t"
CSV_COLUMN_SEPARATOR: str = ","
COMMENT_INDICATOR: str = "#"
LIST_SEPARATOR: str = "|"

Expand Down
24 changes: 24 additions & 0 deletions tests/data/Calendar_2018_geopoint.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
Bulan;Nama_Event;Nama_Sirkuit;Lat;Long
14 November 2017;Valencia MotoGP™ Official Test;Circuit Ricardo Tormo SPAIN;39.486863;-0.629870
28 January 2018;Sepang MotoGP™ Official Test;Sepang International Circuit MALAYSIA;2.759414;101.731778
16 February 2018;Buriram MotoGP™ Official Test;Buriram International Circuit THAILAND;14.957971;103.084925
1 March 2018;Qatar MotoGP™ Official Test;Losail International Circuit QATAR;25.486109;51.452779
18 March 2018;1 - Grand Prix of Qatar;Losail International Circuit QATAR;25.486109;51.452779
8 April 2018;2 - Gran Premio Motul de la República Argentina;Termas de Río Hondo ARGENTINA;-27.498557;-64.860536
22 April 2018;3 - Red Bull Grand Prix of The Americas;Circuit Of The Americas UNITED STATES;30.134581;-97.635851
6 May 2018;4 - Gran Premio Red Bull de España;Circuito de Jerez SPAIN;36.709299;-6.033878
20 May 2018;5 - HJC Helmets Grand Prix de France;Le Mans FRANCE;47.953730;0.213367
3 June 2018;6 - Gran Premio d'Italia Oakley;Autodromo del Mugello ITALY;43.997526;11.371879
17 June 2018;7 - Gran Premi Monster Energy de Catalunya;Circuit de Barcelona-Catalunya SPAIN;41.568227;2.257149
1 July 2018;8 - Motul TT Assen;TT Circuit Assen NETHERLANDS;52.958301;6.522342
15 July 2018;9 - GoPro Motorrad Grand Prix Deutschland;Sachsenring GERMANY;50.791726;12.688303
5 August 2018;10 - Monster Energy Grand Prix České republiky;Automotodrom Brno CZECH REPUBLIC;49.203887;16.445662
12 August 2018;11 - Motorrad Grand Prix von Österreich;Red Bull Ring – Spielberg AUSTRIA;47.220244;14.764848
26 August 2018;12 - Octo British Grand Prix;Silverstone Circuit GREAT BRITAIN;52.073301;-1.014663
9 September 2018;13 - Gran Premio Tribul Mastercard di San Marino e della Riviera di Rimini;Misano World Circuit Marco Simoncelli ITALY;43.961917;12.684440
23 September 2018;14 - Gran Premio Movistar de Aragón;MotorLand Aragon SPAIN;41.078300;-0.204535
7 October 2018;15 - PTT Thailand Grand Prix;Buriram International Circuit THAILAND;14.957971;103.084925
21 October 2018;16 - Motul Grand Prix of Japan;Twin Ring Motegi JAPAN;36.533385;140.227392
28 October 2018;17 - Australian Motorcycle Grand Prix;Phillip Island AUSTRALIA;-38.499915;145.234356
4 November 2018;18 - Shell Malaysia Motorcycle Grand Prix;Sepang International Circuit MALAYSIA;2.759414;101.731778
18 November 2018;19 - Gran Premio Motul de la Comunitat Valenciana;Circuit Ricardo Tormo SPAIN;39.486863;-0.629870
Loading