Skip to content

Commit

Permalink
Canadian mining updates: bug-fixes, more SSHRC matches, less IRSC false-positives, more relation terms.
Browse files Browse the repository at this point in the history
  • Loading branch information
LSmyrnaios committed Jul 24, 2020
1 parent ebfd135 commit 4f36049
Showing 1 changed file with 18 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -67,56 +67,57 @@ union all
-- Canadian funders (CIHR/IRSC, NSERC/CRSNG, SSHRC/CRSH/SSRCC): one branch of the enclosing "union all".
-- For every publication text window whose middle token is a funder acronym, a "council"-like word or
-- "Canada/Canadian", keep the window when it (a) names one of the funders and (b) contains a
-- funding-relation term. Each kept (docid, id) pair is emitted once, as a jdict with a fixed
-- confidence level of 0.8.
-- This is the post-commit side of the change only; the pre-commit lines that were interleaved with it
-- (duplicate CASE branches, duplicate predicates, second "group by") have been dropped.
select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', textsnippet) as C1,
       docid,
       id,
       fundingclass1,
       grantid
from (
    select docid,
           -- Resolve the funder from the whole window (prev + middle + next). Branch order matters:
           -- the first matching funder wins; windows naming none of them get the generic Canadian id.
           -- NOTE(review): the inline "(?i)" sits in the middle of each pattern; whether it applies to the
           -- acronym alternatives as well depends on the regex engine behind regexprmatches — confirm.
           case
               when regexprmatches(".*(?:(?:CIHR|IRSC)|(?i)(?:canad(?:ian|a) institute(?:s)? health research|institut(?:(?:e)?(?:s)?)? recherche sant(?:é|e) canada)).*", prev||" "||middle||" "||next)
                   then (select id from grants where fundingclass1 = 'CIHR')
               when regexprmatches(".*(?:(?:NSERC|CRSNG)|(?i)(?:nat(?:ural|ional) science(?:s)?(?:\sengineering(?:\sresearch)?|\sresearch) co(?:u)?n(?:c|se)(?:i)?l|conseil(?:s)? recherche(?:s)? science(?:s)? naturel(?:les)?(?:\sg(?:e|é)nie)? canada)).*", prev||" "||middle||" "||next)
                   then (select id from grants where fundingclass1 = 'NSERC')
               when regexprmatches(".*(?:(?:SSHRC|CRSH|SSRCC)|(?i)(?:social science(?:s)?|conseil(?:s)? recherche(?:s)?(?:\ssciences humaines)? canada|humanities\sresearch)).*", prev||" "||middle||" "||next)
                   then (select id from grants where fundingclass1 = 'SSHRC')
               else 'canadian_unspecified_id'
           end as id,
           "unidentified" as grantid,
           "Canadian" as fundingclass1,
           (prev||" "||middle||" "||next) as textsnippet
    from
        -- 15 tokens before, 1 middle token, 15 tokens after, taken from the keyword-extracted,
        -- stopword-filtered text. "SSRCC" is listed in the middle pattern so that it agrees with the
        -- acronym filter below.
        -- NOTE(review): assumes textwindow2s only emits windows whose middle token matches the
        -- pattern — confirm against the madIS documentation.
        (setschema 'docid,prev,middle,next' select c1, textwindow2s(filterstopwords(keywords(c2)), 15,1,15, "^(?:(?:(?:CIHR|IRSC)|(?:NSERC|CRSNG)|(?:SSHRC|CRSH|SSRCC))|(?i)(?:co(?:(?:un(?:cil|sel))|(?:nseil(?:s)?))|canad(?:a|ian)))$") from pubs where c2 is not null)
    where
        ( /* Terms */
            /* Acronyms */
            regexprmatches("^(?:CIHR|(?:NSERC|CRSNG)|(?:SSHRC|CRSH|SSRCC))$", middle)
            or (
                regexprmatches("^IRSC$", middle) /* This is the french acronym of CIHR. It also refers to some other organizations, so we search and exclude them. */
                and not regexprmatches(".*(?:informal relationships social capital|interlocus sexual conflict|international (?:rosaceae|rosbreed) snp consortium|iranian seismological cent(?:er|re)).*", lower(prev||" "||next))
            )
            or ( /* Full-names */
                ( /* Middle: "Council", "Counsel", "Conseil", "Conseils" --> NSERC/CRSNG, SSHRC/CRSH/SSRCC */
                    regexprmatches("^co(?:(?:un(?:cil|sel))|(?:nseil(?:s)?))$", lower(middle))
                    and (
                        -- The "middle" is at the beginning of the full name.
                        (
                            regexprmatches("^recherche(?:s)?(?:(?:\s(?:g(?:e|é)nie|science(?:s)?)(?:\s(?:humaines|naturel(?:les)?)?)?(?:\sg(?:e|é)nie)?)?)?\scanada.*", lower(next)) -- "canada" is mandatory here, because we get false-positives without it.
                            or regexprmatches("^social\sscience(?:s)?\shumanities\sresearch\scanada.*", lower(next)) -- It has the word "canada", so it's a canadian match.
                        )
                        or -- The "middle" is at the end of the full name.
                        (
                            (
                                regexprmatches(".*(?:social|nat(?:ural|ional))\sscience(?:s)?\s(?:(?:engineering|humanities)(?:\sresearch)?|research)$", lower(prev))
                                or regexprmatches(".*humanities\sresearch$", lower(prev))
                            )
                            and ( -- Extra check, to be more sure that it is a "canadian" match.
                                regexprmatches("^canada.*", lower(next))
                                or regexprmatches(".*canada\s(?:social|nat(?:ural|ional)).*", lower(prev))
                            )
                        )
                    )
                )
                or ( /* Middle: "Canada", "Canadian" --> CIHR/IRSC */
                    regexprmatches("^canad(?:a|ian)$", lower(middle)) -- "Canadian" match for sure.
                    and ( -- The "middle" is at the beginning of the full name.
                        regexprmatches("^institute(?:s)?\shealth\sresearch.*", lower(next))
                        or -- The "middle" is at the end of the full name.
                        regexprmatches(".*institut(?:(?:e)?(?:s)?)?\srecherche\ssant(?:e|é)$", lower(prev))
                    )
                )
            )
        )
        and ( /* Relation */
            -- A funding-relation term must appear around the match; "thank"-like terms count only when they precede it.
            regexprmatches(".*(?:fund|support|financ|proje(?:c)?t|grant|subvention|sponsor|parrain|souten|subsidiz|promot|acquir|acknowledg|administ|assist|donor|bailleur|g(?:e|é)n(?:e|é)rosit).*", lower(prev||" "||next))
            or regexprmatches(".*(?:thank|gratefull|(?:re)?merci).*", lower(prev))
        )
)
-- Drop windows whose funder lookup in "grants" produced no id, then keep one row per (docid, id).
where id is not null
group by docid, id


union all
Expand Down

0 comments on commit 4f36049

Please sign in to comment.