From 8902e952785067d067ac3c0004c1ef71c6ea5756 Mon Sep 17 00:00:00 2001 From: Matthew Bernhardt Date: Tue, 6 Aug 2024 14:11:07 -0400 Subject: [PATCH] Continued work toward rake task with tests This adds a bulk_replace method to the SR model, which the task calls after it has laid the foundation. The task's concerns are to receive the argument and parse the file initially into a CSV::Table object. Once the parsing is done, then the work passes to the SR model for processing. The remaining work is two-fold: 1. The two pathways for receiving a local file and a remote url are not quite working the same - field headers are specified in different ways between the URI and CSV libraries, and we either need them to yield comparable outcomes or to build two different bulk_replace methods (ick). 2. Tests for the happy path are nearly done, but there are a lot of boundary conditions that still need tests off the happy path. Part of that is knowing where we're going to get the URL-based file from (right now I'm using a local Lando, but that's not great) --- app/models/detector/suggested_resource.rb | 18 ++++++ lib/tasks/suggested_resources.rake | 34 ++++++----- test/fixtures/files/suggested_resources.csv | 3 + test/fixtures/files/suggested_resources.xlsx | Bin 0 -> 9466 bytes .../files/suggested_resources_extra.csv | 3 + .../suggested_resources_missing_field.csv | 3 + .../suggested_resources_wrong_columns.csv | 2 + test/tasks/suggested_resource_rake_test.rb | 53 ++++++++++++++++++ test/vcr_cassettes/remote_csv.yml | 40 +++++++++++++ 9 files changed, 143 insertions(+), 13 deletions(-) create mode 100644 test/fixtures/files/suggested_resources.csv create mode 100644 test/fixtures/files/suggested_resources.xlsx create mode 100644 test/fixtures/files/suggested_resources_extra.csv create mode 100644 test/fixtures/files/suggested_resources_missing_field.csv create mode 100644 test/fixtures/files/suggested_resources_wrong_columns.csv create mode 100644 
test/tasks/suggested_resource_rake_test.rb create mode 100644 test/vcr_cassettes/remote_csv.yml diff --git a/app/models/detector/suggested_resource.rb b/app/models/detector/suggested_resource.rb index 526074e..b927fe2 100644 --- a/app/models/detector/suggested_resource.rb +++ b/app/models/detector/suggested_resource.rb @@ -53,5 +53,23 @@ def calculate_fingerprint(old_phrase) # Rejoin tokens tokens.join(' ') end + + # This accepts an array of values and saves them all as new records. It is + # called by the suggested_resources:reload rake task. + def self.bulk_replace(input) + raise ArgumentError.new, 'Tabular CSV is required' unless input.instance_of?(CSV::Table) + + # Need to check what columns exist in input + required_headers = %i[title url phrase] + missing_headers = required_headers - input.headers + raise ArgumentError.new, "Some CSV columns missing: #{missing_headers}" unless missing_headers.empty? + + Detector::SuggestedResource.delete_all + + input.each do |line| + record = Detector::SuggestedResource.new({ title: line[:title], url: line[:url], phrase: line[:phrase] }) + record.save + end + end end end diff --git a/lib/tasks/suggested_resources.rake b/lib/tasks/suggested_resources.rake index fbab52a..2fe87da 100644 --- a/lib/tasks/suggested_resources.rake +++ b/lib/tasks/suggested_resources.rake @@ -12,26 +12,34 @@ namespace :suggested_resources do Rails.logger.info("Record count before we reload: #{Detector::SuggestedResource.count}") if URI(args.addr).scheme - Rails.logger.info("Loading from remote address: #{args.addr}") url = URI.parse(args.addr) raise ArgumentError.new, 'HTTP/HTTPS scheme is required' unless url.scheme.in?(%w[http https]) - Rails.logger.info(url) - file = url.read - Rails.logger.info(file) - # Need to connect to a CSV content type - # Invalid parsing should... do something? 
+ data = csv_table_from_url_direct(url) else - Rails.logger.info("Loading from local file: #{args.addr}") - file = File.read(args.addr) - Rails.logger.info(file) + file = File.open(args.addr) # Invalid / not found file should ... do something? + data = CSV.table(file) end - Rails.logger.info('Now ready to parse a CSV') - data = CSV.parse(file) - Rails.logger.info(data) + Detector::SuggestedResource.bulk_replace(data) - # Rails.logger.info("Record count after we reload: #{Detector::SuggestedResource.count}") + Rails.logger.info("Record count after we reload: #{Detector::SuggestedResource.count}") + end + + def csv_table_from_url_direct(url) + file = url.open.read.force_encoding('UTF-8').encode + csv = CSV.parse(file, headers: true) + end + + def csv_table_from_url_rebuild(url) + file = url.read + all_rows = CSV.new(file).read + header = [] + all_rows[0].each { |field| header.push(field.strip.downcase.gsub("\xEF\xBB\xBF".force_encoding("UTF-8"), '').to_sym) } + value_rows = all_rows.length - 1 + values = all_rows[1..value_rows] + rebuild = values.map { |row| CSV::Row.new(header, row) } + CSV::Table.new(rebuild) end end diff --git a/test/fixtures/files/suggested_resources.csv b/test/fixtures/files/suggested_resources.csv new file mode 100644 index 0000000..f0bec40 --- /dev/null +++ b/test/fixtures/files/suggested_resources.csv @@ -0,0 +1,3 @@ +Title,URL,Phrase +New Example,https://example.org,new example search +Web of Science,https://libraries.mit.edu/webofsci,web of Science \ No newline at end of file diff --git a/test/fixtures/files/suggested_resources.xlsx b/test/fixtures/files/suggested_resources.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..819974a0172e233ade2c11f2247bdb3b44784c4a GIT binary patch literal 9466 zcmeHN^Z`pk%<7P0CWHVKnpP5O}EfT0svx>0RTb( z`kfb#?d_bwcFqPG9u8p0OHOxNoBJP;?=WWq?jXMZ-}PTS1BLN@PusciWKZR<9w{*Pi%r|+FVQC4^2*1RW{y=OcSVk~7E7YR0V?yO^mpzQOf-HLmCpy93tUqj##_MUJW$<~m 
zv(UXM>a{(z`XXreAaT==_XZUJfWv_R)xW{AN|T%Z2oY;a2-IOAU}@k4wt;YQ{&xQ# z9RG_k_?Mv<#wn|`b7P0@$zQ$dKK(iyizf|olagk(j-SBM~O7v^|nc+zl}{|5ohXZ9{u}PNDhCLfEp@IKxSV&cndersYs3wVCkR zo8xI(w>ljk!5Q!1lP<>0V?W(WzJtDWj}AeM_JZ#Hp1??&+&Kh8{Sze1^^K)|KmcF_ zL1nmznQ^z_a zv&`LrQH*}dUu)ju$FLM7k19`(3X|aKTA%YO7KT^aS}lk7L`Mqn_K*wUjIVuM*m-Q~ zs{<=);S-Ua4jlgoeskV0aya^BuQ}a5*jo?dzycsSEo=69OhhKPk8ONtSE*13OZ0Or zjwW{kZte?^Tx7Aa57{&a`$Cul@%JDjxzDk9&6LCTqTnP zO7VvuWd<+mEXDz4OZ?h{z+6i{5BKvSOa*QJS}0>iS;GF={sS@!FENbj=x8fF zE~rBmPcFLs0`Y10q)Q(~w_3_`#%+ghQBglvfCm*`0Tbbva{F9a__5=e33!30Qha+Y z0|HzykbXr&x-(9_zF&3}Ov9Ffe@8>h;Y&n!T8oCwWCO~#lD8QXrJ()bTBdrT?sGW` z6gEqP0!DO?;20&!>ARg%1i|cTat<^Vd+fD+w|kOj>U?>r7RjXL9Wm z>!!SN{yM9wNe(TfC-m8D@ff>;-r*&d#nq#gg$&c8?0$^1*ThBV&i&v+ukEQv370D6 zn^2R)3%&Ps_UtkiU_u`Ku~BWr|6p|+lK=vGe_#22sIkG9q}W#rTUgaO`rik_5|nHv z%5#eCQD6)eR}9B#LIX?AicJBeBe)PLFU_52B|P+_A!0PPFGqK3A3IDQE*)QyHo|r4 z;l<>qO_nOhjBZ`uvb~7IGbpCL=17bQlcV>71P7PLsIyaz4l_ve?;I$3lPbAd`*Gi> zBAG0y9Mcg6nUN%OLEeAT!Sm8uV4D2FD_FI}@J=^eCA)gBBif_^NPiSM;*fr)>Vx3a z2V}crV$j0eI5e8`H8^)LHVK8iT(Bu|R0OSO%Z?ix{gc#-(xUqBV?#kxLPPYN(GT3e z>k2`{`f#!Z@LX)zT+Ch9l*@Gz-9;oWAb2-4NlW)H9Ijv%iNpz3ES5Lu2018fgi`@= zu6L2v%<+QIFpUp6uc*Xm_fQyV|7wI4zmoW|LQT>)+a|8E7s095?>uvA5AWY6X)O5`-g($Sh%-7k(YHltTT;F{ae1ECdEmku zsZtX|8hKU%K3R81zA%y;j&+#5%PX6u#Ce1AZ&(cwk54*8V6+}VXcPc+Bm`D}=d`~f z_D_C8LX?{b#Q%3+h3d+n4sJXs*3Bz!w^UbR{BO=&blaL+xad8VEYnQ14}Fj3DOu}Z zJ|72h0qsLQ_6E9KPxWQYkUa@)_C!(a2Mz~-=*Bxm)PYY>2nn{oZx!rd z-A#cI)``b;vu6+qt(`G5lE2Ezrjt3U8wusJf^oa>T0w^H^&;|nU*(GbMewujqQJ_U zn)@j%W~=bJ$I~aM?*lz(h7ZXx6vS{m0K*$&v}|XsvM1<28n4WhJz&(zU*abp8mn!ceH1=($n>I91xw^``rQvpTPkzf0RjN=HQ)a%P=n$lUnK`W6$r z{DRmh$6L9_%CLp*(I1C7Uxp{sw~)O3k$0>{yd*T@MdpHx?aWYzght5=-gT)oe6}lX znX-Hv#~1!Fjg^V74v+N`z1=x|p;W-TMDh}=BhZxk(hW*BlaPHzFZ5;GP9k_F7YRQq zw(b1`W(=Yd{pzXbPQtmTI}`-Yl&Zl2HiDu8;=qMY~O zv`T4eSZF-@3zV$q&p`)Qor-gYn`lPoI0Dp3MJFGNdgjHF{dz`n(U--ZwiotClqzKj z=?%W6)3xtChiF~V_sO#Dhd$8`2q|V7bgWWDf61ABbw%Ldyw@3EK7bqB@+ebIDUc^4 zil@PG5i607q297p-gG7sSSYE8Yy#OGl%>CivZ1_qeU@LHeCK)q6$etv`=iFfCnoM} 
zxpJrsdNe<9z!qk^SC*1>z$2{br606RregyZxHv-)G#vhOoy%EZ^2!{J3hWA%6)e46 z+RWveyc+7vobvr_sO5#P6Np%`l?mP!sk6RRk9_;=P|+p<`{;H2woROr;=KT?0J<-d zzRBqUQHWcVFRum+LkEVZ2_x5mp>yWoIwQ@H%P|N6b4hlwP58|H`;ojbi`$R~c)IJA zhpDn+&}n{4Du4WlQdBEupd^v3AK3j;A=Ru>21Pc6#3y)hl$`HL1|jiqb5g+YZdya_ zgYg&)OKua1oP!v0qf|*we>Kw-ingqJJw3h|-(Fb7HX1~wuj%=Ix)z#YqBP{5oga|Eh|KWg1d438nb zGCWMoNiJnEA_96N8E5Rq^>A$O(ajpPt4lBja{9L5-3Kf>hE;_Ltlcwex}MjAmgU=? zDoo3Y(~s2`O{2+GgWMksG>>IB$!?iYNrEd zh7GTHlg`>yO%G5(L)e8xo!dkQvgaw*_V1t!mqu|u`V#oS%AoMe89Av5z~X*;1e1Kx zv+|aQVN3e?wK9)cpM_3PD>}9@VqY5JNp5_-cDS@>kH$skRIt`>k=ezz6qkn~D{D@8 z0d%1RE31O@jqW$ck?U}egD<398E~w7!{AT6nu#;CPZPm)G&eV0@ato@jb`}GG_Ju- zPgD~4s=)8&=4;LTb-d?us}L6boAUx~1Ha=RTz+RaGzKD-r^p$Ji-Z(};ZM66J^M}E zB~36>C6Xgj6_n_dI54OJ>|T9;6T!aFL-CNbSjC1=oH!P&*K%24PBKv;Ur6NAf5jca zvB0xlI2bYtz&+Mu#8T1f8F^IqJi#%Br^#X9QA_yh!Z2&dk>X2MH2gA)4Bj;6Mg|Nu zd3fg_hNIZ5dAzRcqwI&Rj0Vg{fa&gL5(CGAK9i)}v5n1g;kytyy96nt@ zOWSrieWu9n)+~-M7J#gst$`eN17@UZYE-F(Qcu_fgtPS>$=HA=GBRNs8zqTa7Bu*6 z`-~Zcff?T44TskAz=wfHLFnbZw2mm_K%1`oCY7*YUWEz%29_UTifpEX-En)1kjAD& z_i>@{vdNrtTVrv^ zYO|n@Nh#<1S?8C5*mY&t3@3us1IQ1p1L+Kz-hHIku1CS|yABDP(~g?#WvMA8wOvGI zeqHGKAiHNP;yocda`Tmk)q7m6<67co5VV4;gi61_>15G2q-p@KB<|Ur=^neKdJbDV z5ak}csq$M*qCHm1bl_TbrXw`aCpIuU*C?}*MfXwm0pZ@xB|Ec;xz#}T8@?hMj1vAs z9_*BicuOU_OMgq`qzezjJ6JVDlil}QzuM1Ajh2z zQ^I>+M=xLZ^x^}GzPwa%;B32 zw9N+N71u+?a$>kZ9wV_jPT6bT$kf8O=MSg;hl3s34 z8L4QHlMB{~VC~h(g<;LsqR%p}iyjx#(U>I43C*kbeiI8y#Q}r5W^8R$>-*l5W7k`^ zZL05;?ltZ-`Id|)vYm82Ewj|Ml(kRYwYgYyJtrtICaro7uDkSv?byWKL(Ye!x}QRu zWs2VQf6OM|TQf?pJXA}a@+w4EY*2y3%vPrcj}q8790$!D=YAFK$S&mzu)hoIGoSXp zz>e#}dy22kF1lleF_S#P`-r%+=9##`dXkb^IMZPrr(%W1%{z9ybORUYkf!k3t1b`z z76QrC6jH58ts$MAZ-Jf_eXd#ZstUUhW$R(t7cXs}j=2=snT_A= z%b5CPWWrKARYh0XgAzI-EZRTd#I=&lmY_Z55TX_63Z1`*()Ym=KqYRTCDZ&~Rc~M^ z=vk#$GMtN&CAWZkxwtN=BI`KR-Q-QyZi)9|eO}_-LAw3@rGbEnbn=*j8SiCP{yD*U zN#w;F`}-|#B-yA?j!Q}JvIZFq6^^wVRgBnu^-PeOqzZjBS(&;#w@zKpy^twE72Oq8 zIzj~~-aVyA&~FyV<(tm(6}l^T`~i2t`?4;uhAbTddC~HD(Pi(+&r;;J^hf?6OdK1b zYV;w<9_M$Whd6uKfFZwY8uRwkLfF0%hVYM} 
zHztuu4w@Y3zUk)3z!hcZVmsKvHS96~I(an1uS2MtAr`Pc;SoVV(8=YrH^BCYBg{x; zX86@%<#=y(?pS&`c{EZkTkD$I=3u@9tD(DhLe389kqnBwvC@yqF=!Lc3ucemOtaloUnI=pY!SS8rEW91v-j*wGMG-eW3Y!5pc5o@^M+j+U3ySnIV^ zY|apWBDtnNN!0qK2~da4VjoE;_vZAtb;yM1!9(B@aBWjR6%*C!W2>e&-fEs zk&E=@ya$Kdp?dRaOy9R^Rz#m)s_kz8244r05{U&pv9sX4qQ2-sFQzuS^9YrBt7*B- zX%(;Nszup3T0pQk+nCp1dI0-IldeVWIEQIpcgr=yKE~X1A$;#M*6D4lp z;w?jW2ePnF$f4iA!lpZ^yrKv3MgSok5d8rg3nM46=`&|1OFQ$Q=rN6v0s(pOLiVKL zkKZ*6eE$%}u=!!5CrJJh7&=>qVZ&r?ruDs95X2_!<-M0G*tlMuBbH*HABXAX{@l$l z?2?y3+*4@VVDEiP7#*$<2@|daLn%&y=!1{hU>z};ubba=H|fFgF3Dk(=j{Ah0y;5T z*@{B)3?pJwUB%Orufto9z-Gs#NDmL2rrDLVZTwqpv%3eFl-Y`1I?A>iu5cxLp(ObWba zRLB!l(vv~&c%3CZsLa}WnD^m1Q@%uDWvXTCJD?BEd1}*xiHP|`#Sk8UYEPfjll^~H znohM+$z%{SVT+Is@DWOrslAD+lf46k%f#LZ{6|0Ye*zOCOkT&Sf!Yze$vg`DagSGO z<+N#Vbr6e}(|sW|Z>*kKEt`>8y`7Ui<>{iQYe6wR!l!-0Nh^^-#aUbqO;$=$Bf)Rb zU)yMaT~R!~6)p*fi(zKT%#ty8IvM<&@*W@LJ`MgSgxx?`9`n%G%v|( zsO>?kQDCiY8VOmn-IJnN8e=p*lkI3L$q&ewQ%EMF1U(peD`A!Ii+7O})pMnSq*O*3 zK1=8$=Ul%BLY)@Jh^cKioFlw?w4CssoVIFbmDx^zNo9LuLcFQ&wm(VT4kQO74+yiI>Ko!4U-03! zzvr2#4fbF=XxXqXMoi{v`Iq?RgdicWJ;p+g(uB=y$m|srxqIoSv2Gh zSm_jAf|WnMn6tTmAQ6@XS$bo%l5fntsx`fjK{;M6oXX zKS~qQ9S#I*{`;YjKUeL~=f4~gQ3d_o!Qb20e;WSwOhQ28FU{=RhPPXKzf2nu|9Y#z zciZ^yJ%(SV0DuMNPvifm<8a&0ZQ=Qsr&R3!{KP+G=-Xay3wFP}ZPVMu&@a;-ir-Cdmqxce+^#8pdFY_LJqQ0