Skip to content

Commit

Permalink
Start of species synonyms. Needs work for subspecies etc.
Browse files Browse the repository at this point in the history
  • Loading branch information
pengowray committed Jan 17, 2015
1 parent 2c16bb7 commit cbd5216
Show file tree
Hide file tree
Showing 25 changed files with 6,113 additions and 1,412 deletions.
384 changes: 199 additions & 185 deletions CatalogOfLife/CatalogueOfLifeDatabase.cs
Original file line number Diff line number Diff line change
@@ -1,185 +1,199 @@
//------------------------------------------------------------------------------
//-- to start database:
// -- "D:\Program Files (x86)\Catalogue of Life\2013 Annual Checklist\server\mysql\bin\mysqld"
// -- or
// -- "D:\Program Files (x86)\Catalogue of Life\2013 Annual Checklist\Server2Go.exe"
// --
// -- troubleshooting: make sure temp dir listed in "D:\Program Files (x86)\Catalogue of Life\2013 Annual Checklist\server\mysql\my.ini". e.g. mkdir "C:\Users\pengo\AppData\Local\Temp\Server2Go_11948"
// --
// -- workbench or API: connect to 127.0.0.1 port 7188, username: root (default)
//
// -- to import wiktionary database files (example):
// -- cd /d "D:\Program Files (x86)\Catalogue of Life\2013 Annual Checklist\server\mysql\bin\"
// -- gzip -cd D:\ngrams\datasets-wiki\enwiktionary-20140328-categorylinks.sql.gz | mysql --port=7188 --user=root --database=enwiktionary
// -- gzip -cd c:\temp\enwiktionary-20140328-*.sql.gz | mysql --port=7188 --user=root --database=enwiktionary

//-----------------------------------------------------------

using System;
using MySql.Data.MySqlClient;

namespace beastie
{
public class CatalogueOfLifeDatabase {
//private MySqlConnection connection;

public bool dontStartMysqld = false;
public string port = null;
public string year = null;
private bool mysqldStarted = false;

private static CatalogueOfLifeDatabase _instance;
public static CatalogueOfLifeDatabase Instance() {
if (_instance == null) {
_instance = new CatalogueOfLifeDatabase();
}
return _instance;
}


private const string query_CreateDatabaseBeastie =
@"CREATE DATABASE IF NOT EXISTS beastie
CHARACTER SET utf8
DEFAULT COLLATE utf8_general_ci;";

private const string query_CreateAndUseDatabaseWiktionary =
@"CREATE DATABASE IF NOT EXISTS enwiktionary
CHARACTER SET binary;
USE enwiktionary;";

public CatalogueOfLifeDatabase () {
}

public MySqlConnection Connection() {
if (!mysqldStarted && !dontStartMysqld) {
var mysqld = new RunMysqld();
// TODO: check if it's already running I guess?
mysqld.StartDatabase();
port = mysqld.port;
year = mysqld.year; //TODO: should actually be the other way around
mysqldStarted = true;
}

if (port == null || port == "") {
port = "7188"; // default for CatalogueOfLife
}

if (year == null || year == "") {
year = "2014";
}

string server = "127.0.0.1"; // "localhost";
//string port = "7188"; // or 3306, or 7189
string database = "";
string uid = "root";
string password = "";
string connectionString =
"SERVER=" + server + ";" +
"PORT=" + port + ";" +
"DATABASE=" + database + ";" +
"UID=" + uid + ";" +
"PASSWORD=" + password + ";";

MySqlConnection connection = new MySqlConnection(connectionString);
//connection.ConnectionTimeout

connection.Open();

//TODO: check for errors

return connection;
}

public void CreateBeastieDatabase() {
using (MySqlConnection connection = Connection())
using (MySqlCommand command = connection.CreateCommand()) {

command.CommandText = query_CreateDatabaseBeastie;
Console.WriteLine("Checking for / creating Beastie database...");
int result = command.ExecuteNonQuery();
Console.WriteLine("OK");
}
}

public void CreateWiktionaryDatabase() {
using (MySqlConnection connection = Connection())
using (MySqlCommand command = connection.CreateCommand()) {

command.CommandText = query_CreateAndUseDatabaseWiktionary;
Console.WriteLine("Checking for / creating Wiktionary database...");
int result = command.ExecuteNonQuery();
Console.WriteLine("OK");
}
}

public void BuildSpeciesTable() {
// build species data from Collection of Life
// see also: mysql-pengo-2013.txt

//-- Main cleaned up list of species. To be the basis of many other queries
//-- It does not contain synonyms (because it doesn't find any, TODO: add synonyms list).
//-- Genus + epithet trimmed of spaces
//-- Trailing commas or dots have been removed.
//-- Genus capitalized
//-- Virus entries removed (Epithets with spaces which contain any of the following words: virus, viroid, phage)
//-- TODO: remove duplicates (except we'd lose the taxon id of the dupes.. so make new version without IDs and no dupes)


string query_CreateSpeciesViewAndTable = @"
DROP VIEW IF EXISTS beastie.view_col_species;
CREATE VIEW beastie.view_col_species AS
SELECT
CONCAT(UPPER(LEFT(TRIM(genus_word.name_element),1)), MID(TRIM(genus_word.name_element),2)) as genus, -- capitalize first letter of genus
TRIM(TRIM(TRAILING ',' FROM TRIM(TRAILING '.' FROM `epithet_word`.`name_element`))) as epithet, -- trim trailing , or .
taxon.id as taxon_id,
taxon_detail.scientific_name_status_id
FROM taxon
LEFT JOIN taxon_name_element AS epithet_element ON (epithet_element.taxon_id = taxon.id)
LEFT JOIN scientific_name_element AS epithet_word ON (epithet_element.scientific_name_element_id = epithet_word.id)
LEFT JOIN taxon AS genus_taxon ON (epithet_element.parent_id = genus_taxon.id)
LEFT JOIN taxon_name_element AS genus_element ON (genus_element.taxon_id = genus_taxon.id)
LEFT JOIN scientific_name_element AS genus_word ON (genus_element.scientific_name_element_id = genus_word.id)
LEFT JOIN taxon_detail ON (taxon.id = taxon_detail.taxon_id)
WHERE
taxon.taxonomic_rank_id = 83
AND genus_taxon.taxonomic_rank_id = 20 -- not strictly needed as it's always 20.
AND (taxon_detail.scientific_name_status_id = 1 OR taxon_detail.scientific_name_status_id = 4 OR taxon_detail.scientific_name_status_id = 5) -- NOT: ambiguous syn, misapplied name
-- taxon_detail.scientific_name_status_id = 1 (accepted), 2=ambiguous syn, 3=misapplied name, 4=provisionally accepted name, 5=synonym
-- note: only 1 and 4 found (not 5 as that's in other tables)
AND (LOCATE(' ', TRIM(`epithet_word`.`name_element`)) = 0
OR (locate('virus', `epithet_word`.`name_element`) = 0
and locate('viroid', `epithet_word`.`name_element`) = 0
and locate('phage', `epithet_word`.`name_element`) = 0))
ORDER BY
genus, epithet, taxon_id, scientific_name_status_id;
-- materialized view of above
DROP TABLE IF EXISTS beastie._col_species;
CREATE TABLE beastie._col_species SELECT * FROM beastie.view_col_species LIMIT 0, 10000000;
ALTER TABLE beastie._col_species
ADD PRIMARY KEY (`taxon_id`),
ADD UNIQUE INDEX `taxon_id_UNIQUE` (`taxon_id` ASC);
";

CreateBeastieDatabase();

using (MySqlConnection connection = Connection()) {
Console.WriteLine("Connecting to CoL database...");
using (MySqlCommand command = connection.CreateCommand()) {
//MySqlCommand cmd = new MySqlCommand(query, connection);
query_CreateSpeciesViewAndTable = "USE col" + year + "ac; " + query_CreateSpeciesViewAndTable;
command.CommandTimeout = 900; // 900 = 15 minutes. Should hopefully be done by then.
Console.WriteLine("Creating materialized Species table. This could take a while...");
command.CommandText = query_CreateSpeciesViewAndTable;
int result = command.ExecuteNonQuery();
}
}
}




}
}

//------------------------------------------------------------------------------
//-- to start database:
// -- beastie.exe mysqld
// --
// -- old way:
// -- "D:\Program Files (x86)\Catalogue of Life\2013 Annual Checklist\server\mysql\bin\mysqld"
// -- or
// -- "D:\Program Files (x86)\Catalogue of Life\2013 Annual Checklist\Server2Go.exe"
// --
// -- troubleshooting: make sure temp dir listed in "D:\Program Files (x86)\Catalogue of Life\2013 Annual Checklist\server\mysql\my.ini". e.g. mkdir "C:\Users\pengo\AppData\Local\Temp\Server2Go_11948"
// --
// -- workbench or API: connect to 127.0.0.1 port 7188, username: root (default)
//
// -- to import wiktionary database files (example):
// -- cd /d "D:\Program Files (x86)\Catalogue of Life\2013 Annual Checklist\server\mysql\bin\"
// -- gzip -cd D:\ngrams\datasets-wiki\enwiktionary-20140328-categorylinks.sql.gz | mysql --port=7188 --user=root --database=enwiktionary
// -- gzip -cd c:\temp\enwiktionary-20140328-*.sql.gz | mysql --port=7188 --user=root --database=enwiktionary

//-----------------------------------------------------------

using System;
using System.Data;
using System.Collections.Generic;
using MySql.Data.MySqlClient;

namespace beastie
{
public class CatalogueOfLifeDatabase {
//private MySqlConnection connection;

public bool dontStartMysqld = false;
public string port = null;
public string year = null; // CoL database year
private bool mysqldStarted = false;

private static CatalogueOfLifeDatabase _instance;
public static CatalogueOfLifeDatabase Instance() {
if (_instance == null) {
_instance = new CatalogueOfLifeDatabase();
}
return _instance;
}

public string DatabaseName() {
return "col" + year + "ac";
}

private const string query_CreateDatabaseBeastie =
@"CREATE DATABASE IF NOT EXISTS beastie
CHARACTER SET utf8
DEFAULT COLLATE utf8_general_ci;";

private const string query_CreateAndUseDatabaseWiktionary =
@"CREATE DATABASE IF NOT EXISTS enwiktionary
CHARACTER SET binary;
USE enwiktionary;";

public CatalogueOfLifeDatabase () {
}

public MySqlConnection Connection() {
if (!mysqldStarted && !dontStartMysqld) {
var mysqld = new RunMysqld();
// TODO: check if it's already running I guess?
mysqld.StartDatabase();
port = mysqld.port;
year = mysqld.year; //TODO: should actually be the other way around
mysqldStarted = true;
}

if (port == null || port == "") {
port = "7188"; // default for CatalogueOfLife
}

if (year == null || year == "") {
year = "2014";
}

string server = "127.0.0.1"; // "localhost";
//string port = "7188"; // or 3306, or 7189
string database = "";
string uid = "root";
string password = "";
string connectionString =
"SERVER=" + server + ";" +
"PORT=" + port + ";" +
"DATABASE=" + database + ";" +
"UID=" + uid + ";" +
"PASSWORD=" + password + ";";

MySqlConnection connection = new MySqlConnection(connectionString);
//connection.ConnectionTimeout

connection.Open();

//TODO: check for errors

return connection;
}

public void CreateBeastieDatabase() {
using (MySqlConnection connection = Connection())
using (MySqlCommand command = connection.CreateCommand()) {

command.CommandText = query_CreateDatabaseBeastie;
Console.WriteLine("Checking for / creating Beastie database...");
int result = command.ExecuteNonQuery();
Console.WriteLine("OK");
}
}

public void CreateWiktionaryDatabase() {
using (MySqlConnection connection = Connection())
using (MySqlCommand command = connection.CreateCommand()) {

command.CommandText = query_CreateAndUseDatabaseWiktionary;
Console.WriteLine("Checking for / creating Wiktionary database...");
int result = command.ExecuteNonQuery();
Console.WriteLine("OK");
}
}

public void BuildSpeciesTable() {
// build species data from Collection of Life
// see also: mysql-pengo-2013.txt

//-- Main cleaned up list of species. To be the basis of many other queries
//-- It does not contain synonyms (because it doesn't find any, TODO: add synonyms list).
//-- Genus + epithet trimmed of spaces
//-- Trailing commas or dots have been removed.
//-- Genus capitalized
//-- Virus entries removed (Epithets with spaces which contain any of the following words: virus, viroid, phage)
//-- TODO: remove duplicates (except we'd lose the taxon id of the dupes.. so make new version without IDs and no dupes)

// TODO: instead just use col2014ac._search_scientific table. much simpler. no joins.

string query_CreateSpeciesViewAndTable = @"
DROP VIEW IF EXISTS beastie.view_col_species;
CREATE VIEW beastie.view_col_species AS
SELECT
CONCAT(UPPER(LEFT(TRIM(genus_word.name_element),1)), MID(TRIM(genus_word.name_element),2)) as genus, -- capitalize first letter of genus
TRIM(TRIM(TRAILING ',' FROM TRIM(TRAILING '.' FROM `epithet_word`.`name_element`))) as epithet, -- trim trailing , or .
taxon.id as taxon_id,
taxon_detail.scientific_name_status_id
FROM taxon
LEFT JOIN taxon_name_element AS epithet_element ON (epithet_element.taxon_id = taxon.id)
LEFT JOIN scientific_name_element AS epithet_word ON (epithet_element.scientific_name_element_id = epithet_word.id)
LEFT JOIN taxon AS genus_taxon ON (epithet_element.parent_id = genus_taxon.id)
LEFT JOIN taxon_name_element AS genus_element ON (genus_element.taxon_id = genus_taxon.id)
LEFT JOIN scientific_name_element AS genus_word ON (genus_element.scientific_name_element_id = genus_word.id)
LEFT JOIN taxon_detail ON (taxon.id = taxon_detail.taxon_id)
WHERE
taxon.taxonomic_rank_id = 83
AND genus_taxon.taxonomic_rank_id = 20 -- not strictly needed as it's always 20.
AND (taxon_detail.scientific_name_status_id = 1 OR taxon_detail.scientific_name_status_id = 4 OR taxon_detail.scientific_name_status_id = 5) -- NOT: ambiguous syn, misapplied name
-- taxon_detail.scientific_name_status_id = 1 (accepted), 2=ambiguous syn, 3=misapplied name, 4=provisionally accepted name, 5=synonym
-- note: only 1 and 4 found (not 5 as that's in other tables)
AND (LOCATE(' ', TRIM(`epithet_word`.`name_element`)) = 0
OR (locate('virus', `epithet_word`.`name_element`) = 0
and locate('viroid', `epithet_word`.`name_element`) = 0
and locate('phage', `epithet_word`.`name_element`) = 0))
ORDER BY
genus, epithet, taxon_id, scientific_name_status_id;
-- materialized view of above
DROP TABLE IF EXISTS beastie._col_species;
CREATE TABLE beastie._col_species SELECT * FROM beastie.view_col_species LIMIT 0, 10000000;
ALTER TABLE beastie._col_species
ADD PRIMARY KEY (`taxon_id`),
ADD UNIQUE INDEX `taxon_id_UNIQUE` (`taxon_id` ASC);
";

CreateBeastieDatabase();

using (MySqlConnection connection = Connection()) {
Console.WriteLine("Connecting to CoL database...");
using (MySqlCommand command = connection.CreateCommand()) {
//MySqlCommand cmd = new MySqlCommand(query, connection);
query_CreateSpeciesViewAndTable = "USE col" + year + "ac; " + query_CreateSpeciesViewAndTable;
command.CommandTimeout = 900; // 900 = 15 minutes. Should hopefully be done by then.
Console.WriteLine("Creating materialized Species table. This could take a while...");
command.CommandText = query_CreateSpeciesViewAndTable;
int result = command.ExecuteNonQuery();
}
}
}

public Dictionary<string,string> BranchOfLife(Species species) {
//Connection().
//var results = from r in

return null;
}


}
}

Loading

0 comments on commit cbd5216

Please sign in to comment.