diff --git a/.gitignore b/.gitignore index 3b66d17e..6fbb068e 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,7 @@ target/ *.iml *.ipr *.iws -/.idea/ \ No newline at end of file +/.idea/ + +.DS_Store + diff --git a/duke-core/.idea/compiler.xml b/duke-core/.idea/compiler.xml new file mode 100644 index 00000000..96cc43ef --- /dev/null +++ b/duke-core/.idea/compiler.xml @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/duke-core/.idea/copyright/profiles_settings.xml b/duke-core/.idea/copyright/profiles_settings.xml new file mode 100644 index 00000000..e7bedf33 --- /dev/null +++ b/duke-core/.idea/copyright/profiles_settings.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/duke-core/.idea/misc.xml b/duke-core/.idea/misc.xml new file mode 100644 index 00000000..8d8dc8d4 --- /dev/null +++ b/duke-core/.idea/misc.xml @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + 1.8 + + + + + + + + \ No newline at end of file diff --git a/duke-core/.idea/modules.xml b/duke-core/.idea/modules.xml new file mode 100644 index 00000000..211fc4dc --- /dev/null +++ b/duke-core/.idea/modules.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/duke-core/.idea/workspace.xml b/duke-core/.idea/workspace.xml new file mode 100644 index 00000000..c65a71bb --- /dev/null +++ b/duke-core/.idea/workspace.xml @@ -0,0 +1,562 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1469218236929 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
package no.priv.garshol.duke.cleaners;

import no.priv.garshol.duke.Cleaner;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * A cleaner which removes English stopwords from a string.
 *
 * <p>The value is first passed through a {@link LowerCaseNormalizeCleaner}.
 * The result is split on whitespace, every token that appears in the bundled
 * stopword list is dropped, and the remaining tokens are joined with single
 * spaces.
 */
public class StopwordsCleaner implements Cleaner {
  /** Classpath location of the stopword list (one word per line). */
  private static final String STOPWORDS_RESOURCE =
      "no/priv/garshol/duke/english-stopwords.txt";

  private final LowerCaseNormalizeCleaner sub;
  private final Set<String> stopwords;

  /**
   * Creates the cleaner and loads the stopword list from the classpath.
   *
   * @throws RuntimeException if the stopword list cannot be found or read;
   *         the underlying {@link IOException} is preserved as the cause
   */
  public StopwordsCleaner() {
    this.sub = new LowerCaseNormalizeCleaner();
    try {
      this.stopwords = loadStopwords();
    } catch (IOException e) {
      throw new RuntimeException(
          "Could not load stopwords from " + STOPWORDS_RESOURCE, e);
    }
  }

  /**
   * Removes stopwords from the given value.
   *
   * @param value the string to clean; may be {@code null}
   * @return {@code null} if {@code value} is {@code null}; otherwise the
   *         result of the sub-cleaner with all stopword tokens removed. If
   *         the sub-cleaner yields {@code null} or the empty string, that
   *         result is returned unchanged.
   */
  @Override
  public String clean(String value) {
    if (value == null) {
      return null;
    }

    String normalized = sub.clean(value);
    if (normalized == null || normalized.equals("")) {
      return normalized;
    }

    // The result list is local to the call so that nothing leaks from one
    // invocation into the next.
    List<String> kept = new ArrayList<>();
    for (String word : normalized.split("\\s+")) {
      // split() yields a leading empty token if the string starts with
      // whitespace; skip it rather than emit a stray separator.
      if (!word.isEmpty() && !stopwords.contains(word)) {
        kept.add(word);
      }
    }
    return String.join(" ", kept);
  }

  /**
   * Reads the stopword list from the classpath.
   *
   * @return the set of stopwords, trimmed, with blank lines ignored
   * @throws IOException if the resource is missing or cannot be read
   */
  private static Set<String> loadStopwords() throws IOException {
    // The list is packaged as a resource, so it has to be resolved through
    // the class loader; a plain file path only works from one specific
    // working directory.
    InputStream stream = StopwordsCleaner.class.getClassLoader()
        .getResourceAsStream(STOPWORDS_RESOURCE);
    if (stream == null) {
      throw new FileNotFoundException(
          "Stopword list not found on classpath: " + STOPWORDS_RESOURCE);
    }

    Set<String> words = new HashSet<>();
    try (BufferedReader in = new BufferedReader(
        new InputStreamReader(stream, StandardCharsets.UTF_8))) {
      String line;
      while ((line = in.readLine()) != null) {
        String word = line.trim();
        if (!word.isEmpty()) {
          words.add(word);
        }
      }
    }
    return words;
  }
}
-0,0 +1,496 @@ +a +able +about +above +abst +accordance +according +accordingly +across +act +actually +added +adj +affected +affecting +affects +after +afterwards +again +against +ah +all +almost +alone +along +already +also +although +always +am +among +amongst +an +and +announce +another +any +anybody +anyhow +anymore +anyone +anything +anyway +anyways +anywhere +apparently +approximately +are +aren +arent +arise +around +as +aside +ask +asking +at +auth +available +away +awfully +b +back +be +became +because +become +becomes +becoming +been +before +beforehand +begin +beginning +beginnings +begins +behind +being +believe +below +beside +besides +between +beyond +biol +both +brief +briefly +but +by +c +ca +came +can +cannot +can't +cause +causes +certain +certainly +co +com +come +comes +contain +containing +contains +could +couldnt +d +date +did +didn't +different +do +does +doesn't +doing +done +don't +down +downwards +due +during +e +each +ed +edu +effect +eg +eight +eighty +either +else +elsewhere +end +ending +enough +especially +et +et-al +etc +even +ever +every +everybody +everyone +everything +everywhere +ex +except +f +far +few +ff +fifth +first +five +fix +followed +following +follows +for +former +formerly +forth +found +four +from +further +furthermore +g +gave +get +gets +getting +give +given +gives +giving +go +goes +gone +got +gotten +h +had +happens +hardly +has +hasn't +have +haven't +having +he +hed +hence +her +here +hereafter +hereby +herein +heres +hereupon +hers +herself +hes +hi +hid +him +himself +his +hither +home +how +howbeit +however +hundred +i +id +ie +if +i'll +im +immediate +immediately +importance +important +in +inc +indeed +index +information +instead +into +invention +inward +is +isn't +it +itd +it'll +its +itself +i've +j +just +k +keep +keeps +kept +kg +km +know +known +knows +l +largely +last +lately +later +latter +latterly +least +less +lest +let +lets +like +liked +likely +line +little +'ll +look +looking +looks +ltd +m 
+made +mainly +make +makes +many +may +maybe +me +mean +means +meantime +meanwhile +merely +mg +might +million +miss +ml +more +moreover +most +mostly +mr +mrs +much +mug +must +my +myself +n +na +name +namely +nay +nd +near +nearly +necessarily +necessary +need +needs +neither +never +nevertheless +new +next +nine +ninety +no +nobody +non +none +nonetheless +noone +nor +normally +nos +not +noted +nothing +now +nowhere +o +obtain +obtained +obviously +of +off +often +oh +ok +okay +old +omitted +on +once +one +ones +only +onto +or +ord +other +others +otherwise +ought +our +ours +ourselves +out +outside +over +overall +owing +own +p +page +pages +part +particular +particularly +past +per +perhaps +placed +please +plus +poorly +possible +possibly +potentially +pp +predominantly +present +previously +primarily +probably +promptly +proud +provides +put +q +que +quickly +quite +qv +r +ran +rather +rd +re +readily +really +recent +recently +ref +refs +regarding +regardless +regards +related +relatively +research +respectively +resulted +resulting +results +right +run +s +said +same +saw +say +saying +says +sec +section +see +seeing +seem +seemed +seeming +seems +seen +self +selves +sent +seven +several +shall +she +shed +she'll +shes +should +shouldn't +show +showed +shown +showns +shows +significant +significantly +similar +similarly +since +six +slightly +so +some +somebody +somehow +someone +somethan +something +sometime +sometimes +somewhat +somewhere +soon +sorry +specifically +specified +specify +specifying +still +stop +strongly +sub +substantially +successfully +such +sufficiently +suggest +sup +sure \ No newline at end of file diff --git a/duke-core/src/test/java/no/priv/garshol/duke/cleaners/StopwordsCleanerTest.java b/duke-core/src/test/java/no/priv/garshol/duke/cleaners/StopwordsCleanerTest.java new file mode 100644 index 00000000..f1ce7f8b --- /dev/null +++ b/duke-core/src/test/java/no/priv/garshol/duke/cleaners/StopwordsCleanerTest.java @@ -0,0 +1,32 @@ + 
+package no.priv.garshol.duke.cleaners; + +import org.junit.Before; +import org.junit.Test; + +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertTrue; + +public class StopwordsCleanerTest extends LowerCaseNormalizeCleanerTest { + + public void setUp() { + cleaner = new StopwordsCleaner(); + } + + @Test + public void testMapping() { + assertEquals("Hello my name is duke", cleaner.clean("hello name duke")); + } + + @Test + public void testEmpty() { + assertTrue(cleaner.clean("") == ""); + } + + @Test + public void testNull() { + assertTrue(cleaner.clean(null) == null); + } + + +} \ No newline at end of file