From 986f904d87e4b19f7ddc0cfa070784b512b99eec Mon Sep 17 00:00:00 2001 From: Sandra Chung Date: Mon, 7 Dec 2020 16:00:26 -0700 Subject: [PATCH 01/52] requirements.txt --- .gitignore | 2 ++ README_local.md | 2 ++ requirements.txt | 9 +++++++++ 3 files changed, 13 insertions(+) create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index 7c21aed..9f7cd3b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ *.DS_Store *.csv +__pycache__ +*.pyc diff --git a/README_local.md b/README_local.md index 0f831f5..6e55771 100644 --- a/README_local.md +++ b/README_local.md @@ -11,6 +11,8 @@ This document describes how to use 5 versions of name extraction scripts for vot - Levenshtein `pip install python-Levenshtein` - NLTK `pip install nltk` +- Alternatively, if you are in an environment where you can't / don't want to install Anaconda, create and activate your Python 3 virtualenv (see [pipenv and virtualenv](https://docs.python-guide.org/dev/virtualenvs/)) and run `pip install -r requirements.txt` + ## Getting Started Find your use case below and add your input data to the appropriate place, then run the specified python script. 
All of these scripts should be run out of the directory `Projects/NLP/SMS_Annotation` diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d84b090 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +matplotlib==3.3.3 +nltk==3.5 +numpy==1.19.4 +pandas==1.1.5 +pathlib==1.0.1 +python-Levenshtein==0.12.0 +scikit-learn==0.23.2 +scipy==1.5.4 +spacy==2.3.4 \ No newline at end of file From c67d4eb36396379883802df680b4a2cad720442b Mon Sep 17 00:00:00 2001 From: Sandra Chung Date: Mon, 7 Dec 2020 16:00:26 -0700 Subject: [PATCH 02/52] requirements.txt --- .gitignore | 2 ++ README_local.md | 2 ++ requirements.txt | 9 +++++++++ 3 files changed, 13 insertions(+) create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index 7c21aed..9f7cd3b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ *.DS_Store *.csv +__pycache__ +*.pyc diff --git a/README_local.md b/README_local.md index 0f831f5..6e55771 100644 --- a/README_local.md +++ b/README_local.md @@ -11,6 +11,8 @@ This document describes how to use 5 versions of name extraction scripts for vot - Levenshtein `pip install python-Levenshtein` - NLTK `pip install nltk` +- Alternatively, if you are in an environment where you can't / don't want to install Anaconda, create and activate your Python 3 virtualenv (see [pipenv and virtualenv](https://docs.python-guide.org/dev/virtualenvs/)) and run `pip install -r requirements.txt` + ## Getting Started Find your use case below and add your input data to the appropriate place, then run the specified python script. 
All of these scripts should be run out of the directory `Projects/NLP/SMS_Annotation` diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d84b090 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +matplotlib==3.3.3 +nltk==3.5 +numpy==1.19.4 +pandas==1.1.5 +pathlib==1.0.1 +python-Levenshtein==0.12.0 +scikit-learn==0.23.2 +scipy==1.5.4 +spacy==2.3.4 \ No newline at end of file From de022eeaccba79b891aee97d156a89940c344782 Mon Sep 17 00:00:00 2001 From: Sandra Chung Date: Fri, 18 Dec 2020 16:23:33 -0700 Subject: [PATCH 03/52] stub of a Flask frontend --- .gitignore | 2 +- .../Code/__pycache__/utilities.cpython-36.pyc | Bin 18029 -> 0 bytes .../NLP/SMS_Annotation/Input_Data/.DS_Store | Bin 6148 -> 0 bytes README_local.md | 18 +++- app.py | 11 +++ requirements.txt | 3 +- templates/upload_form.html | 82 ++++++++++++++++++ 7 files changed, 112 insertions(+), 4 deletions(-) delete mode 100644 Projects/NLP/SMS_Annotation/Code/__pycache__/utilities.cpython-36.pyc delete mode 100644 Projects/NLP/SMS_Annotation/Input_Data/.DS_Store create mode 100644 app.py create mode 100644 templates/upload_form.html diff --git a/.gitignore b/.gitignore index 9f7cd3b..525f974 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -*.DS_Store +.DS_Store *.csv __pycache__ *.pyc diff --git a/Projects/NLP/SMS_Annotation/Code/__pycache__/utilities.cpython-36.pyc b/Projects/NLP/SMS_Annotation/Code/__pycache__/utilities.cpython-36.pyc deleted file mode 100644 index 153d16e9785cc3e3b1c0a078d87468ed572143e5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 18029 zcmch9d5|2}d0%(WeePnhSR4e&6-WXY5Clk&7AcAr2;!h1g9{i7l5lgu(d@k5-PxU; z8NBYr9(iLc0VP0=>Ci4GjxE}zqr{Pwt71FOKZ)ZwaSrEj;y8}nNu`vy$}+1`u2dy| z@t@@P`(DrNfu&V)N?Fvres{m)`+nc?UgOzHrTD_9KB#|T#c}??+45IH{AK*YGoIrp z$9I&gyteDR^7MR9o+&?tXR4iE&iEOZb<*wZa?a05+-v9kg48SeB|J0j@^ZzmxK5B! 
z*}JY^4N{9EDu>u;;4bb_d4ywkJ%6vH3Q1~QQpF^-Pg12MH6f{TlG?8-{sC3>4@&Mx zl6y!}d;|M_NbON$YOfks`_zQmKcDjts{`ubJC6Ub>Z*s-;dfmB5!F)LkjaP*15-cpg<=a4>i% z*cXfkhtz5H^juy&qt2k_G4-r^4mFM|4|Vp=rPTMRPa?mD)C6kpQ_s(3)C=lU$UC84 zRG&uOCzVz&sqaP3NfoNI>Se^9Qs>kwC_AMhbwOQ3&S~{L{#{b9%3CLb{pz!G1$9|{ z4rNcP?^9RQYideeRoB$_tLtidzMyWX*VP;IDa_(cbrZFoQ7dXj-9qe)x})A!b;O=k zpH~gM=Q%a2no6la&8c~{pjv8CEvfc=Y0g#4s)L%}qr7RSwz~1j;&~l3qCib{JCipS zRwl1Dqshq^CZGP)i)WsD@yrX8XP!Rubg}qyVK7*)?Ug79iyx8*r)q9IGTrH} z&NkZZ=_puU4m7f|3t`k~E+Lk_)`%A3T+o?sx59Mll!x3@4bJh~mD z8Y}Y)Of(l-ZKZ>bjmR6&69={2UB*uus)jOiSU}XD(-2PqL8!cp+y)OsdZu*8*CO`J?_T03qZXWt5Moy8`cx0|rs)q&{+n$ztC-5wTf0ezUT ziKcBc%R$MQ=L6I21SVSTVl#CRm^p6y9G5+8t(h>mgXQ7tBdjVRe=o%Z+x!EOrAk+X7LT(3rysbg{DB zGpgHZv=#E(jZUj!T6Y?qB|OxUiPp{fs#$NWn{KxUDwS|O80>pv!JwXLwcDm^)=e9; zwX_|AkU{9%s2;o<7}bg*%zn9L7MG0f&b3MCS3$|GWtrjR99DjLeX`S74kp1D5txI| z)rIbsB#5TdjZC=G>veU6hHcPl5NNh$hxwU~DVmv?&c-CRsiS6Q%~3P+l$kkgW=@%t zW>9QqKE3h8?%dgI^zZ^qBM5J^%dex_pK|l;dWUvjw(pQ#f37s=GY?AWAcn15?RJhu z2v^#QkxtieMMAUC2$}Bb?j7u@6c6 zEH@!EM4@~ zkY=a+;^j_nC8}R&M2$F00O>pf2+RkaW>*0Zn&1!l<4aTcP2(4iBbdj_Zk79Z-J)~X z`7!TydfNF#2W>O$?kW(~i@j4%$8J-XQCWhIe&F=$A$aVxMdQ;Ky3LhkphtN6)mHS` zmD$r<`Q!98-CYctD4e=-?ez3()Ae&5EHDPx?VLW}Rl(_%sMT%(3&T^r^^b}#0jr~C zce(eO8b`?CW}S^A?QTQW=Px%kj8`v6qY7QO7`4!k(u3b_df_nmcr(mWne` zW44Wfa!j+0Fz|~I1N9zwrxCKa*}BsTW3Scqi*}k!V}1o`$En4hn$?HVo!-x2xN&w~ zMVXiMauUU15rO08@z29gPa@SMh47Cny4y4`=wuomC-QCzx*SNVma@~RdHNx=)Q1q% zQg$d0BkqrJQga$(tVi9ti~%F})STuT=|>sJz;}$f9IO#uY>Q`6E7=JSv;q$#9IdeN!7F$J?zX;9a|WCs4IhC%8!9%=u71hWUNuPHuQJ zr#8G(lOMS!H@uTf9^J@Hf_^8_c4l>E^~u_?=$>crWZ+D2+7e6p6)k7Fir28v5_A$BjvsZP76pI}~j>f)RAE0?D()~{cDRTC3x z*|~`# zQVS}iWr5n%fE4V+~O4ZWe7|I{1<}cG{L{_5=4Okk%9Ew zrHnp6?VJ#G5Int=uVsg^-zh*S1VgGFMscRqQNh~I@#Q%_V)He;A|wYm7*5*L&m(r@ zqdhZF0ups#W_EdfW_AcUgrgzHypIH^F=1b!*Hl8oFe6naq2bNEtjY~(*j0H|FlnJ* z^Z`CAf{Igp@RXjDUdc6Cg~pC&<<36 z%Rm;;jcA~b5W^-}kTmqmC>=^sg4R%^Pa+}PHy{W~M>v=uuc$F&Be6x9DJ;{hWw&gr z^Ay?c)7UeSBYz6nL1FaN%o!*)!^N$6{=Ti#3hOdefO>h3ke#ge&INg>2VjS`yxWpQ 
zm|6msvIwsrJzNCpHPp)lxv8PrMF3BD%2_I4(%#fZ?#sy#!RE*B9O3((p^6j`(c=}uTjXNHaz*hW1V{YC}L-KK6ta98wJ;BtV)li0M0 zk=szqtqx3}54my80s{Vx;Wtz$Phnq`OI)Sk(W7cFv>=!9e$iSwk++yKo+;k-Xz%n( zQQDO5x+=Tmg>NMB+>)oifqEHeN2tf8(BGJM`sKB@ZLJFX9oxgLoBfKZP^VGel2>*# zYd~HgFAuzD+kBFjc_1$X+^-DknDS!5_H@^Eo!jj<9gL=ok(D?aQ|_0H`<|;Hrru{rzkj*gGHR56#) zr8LTZ-Be6A>Zhfn_@)t1%Ldm=Rb^W3YMmM5AU8=31De%W!<^l7QfiAzKWvK@8eS zke3*=2L=W$_S6f344cL+ID(2Q9-_W^x{6fn?nExV2iZS@-@pbv_8^jlyBVS%l2Je8 zur^A^2TIP-+Jk@*ShmsY!2@Yk@Nue^^yJk=Keq;0JPKD{y&XhQ#6`%2aRx4_K*vS6k6Mw~ zYVoKQfvvf9+bYyV*xFc}pV^6&2lyET zl%985!D=0EUk?5KNeWu>e6&!fgE3s_wv|7c6aXyY6o*^XO6Ejz3=*;!{+JA@E(wIN z5yCjlJlYI)l*uGw5$_>9Bak}r=+33_CxW#|H=6LW+m-nk!L)Z5`ZkzccpZV0Pp9%O ze-o+WP~0ZGswi@!?wD6bepZZ+2``;WyG3s-H379Qou0t6nmXVX-7M-Hc8h3VbVrd_ zcJ*86b!e94kQhpMDvJuf63B+s+GhyZWYMh#!fcMPJ4%^yd)3@-JPXw>N}qyw!o*1x5?q(tdfdp}Y0f zR%fXlXrWCQhU7PYL~Q;L%Wp|g`^?QPt_oSkE;fq2!0ETTlQ*YO^SAL!6rVJfvIu`t z8HLBnp_J0(HI(v%MXwcO%>;95k(2r9csPeN}{1d-$tKidzvnVeg=llCDarV z1PHXbSrIi^h??Bm1(U(zzSPf~yvZ)*^p~OxG-Bw}Si~|H73#OpogHe=&4oq=( zL*jWGhv>^g_&_5UuS#ahDyhU9K^{SyT47Uf5?$YN_!Z2^ay@6QUqcXA2l|m+^cq11WR2%3c~4OcA?R#O{^B6uOBTmztz!{?Yy#()IsD!gzWd`VOQw zty*jnY)@_le)+ZEu-l3)`PCULd%qZ^7Skp}?L{vj4oWx02^i+sl#(gww@nsW%-g0U z+ClO5@4zT3LWUPRi_$VZ(1^mL(i&;onp%?NrJ?;KT2puwj{4kER)6jG(awn|hkP8z zE;;(QOp*FgQ~-f6UIdX9A=8VCCB{sCvCO(4jH4ii3d|OzL z8V{80`I(%US)b3bur~Qbn8>G%^b_H+&~oR9r4|xOTnIYwGXs$j{S?rTMGSwBH4%g?tx(!QjQaCv@xSm3=?HKNVk?8@ z2)Co4wxUNM9mfy2;Q?X+tyTP_-8SJqw4`4~Kh|cTB8nJnXjseEe4;ajv8{Quptt0S zVU?fSco94KoqOM`EmnXTWz8{YGvXrou9#^L*ub9t0nA>XV^AIjw4(GJg2T|+5|=XV z&~Z43kUNx4=V2|y&h14jyllxXx>SVEA=5EB^^ok-##m9jCV++KF_I0Dgt4+8?R zwD(01p7Rbm_Z>*3wDE@8u&fAo6Fv{oh8GbR5e?^LVW@V++4VZuUfEg8o>RAfkrSqx zj`O8siHcO;!nV8V_8?liEuhINzUuRg=i_xl;>m*xH`nM$%Wz{ac-y@{45eYGdv*Se%QC~LrbDi#+pOyzn|`Bd9u)g zyjkm+EF{bVxt4G zOeIc%!s9&P5(k6bPVCmDno$2Gt5CQ?
HJ)dWz3h;OnspBk?pmlkoJHLe2!tqsC zt!4!L#kG?a9ax7Hv*5K!vZTT=MTqt*gP}@E^{(JgClIJ|*DPM?JxVCP|5Zq#&|@ z@Zs+wVp8|vzF)-tnKWat)Oqd*i@Q%v8Bg*eOH39zMg@Kq?>^3)Oy$0J*NZZXSyMo& z%6xhnwv^<+(b;ZwP`$^iB zRPOz9AN~N)P=0EN8D!icW>75?%$VrAm;`|tlfCQShoWTBLN)`ZrzseZazZCwvBZjw zQ`ig;W~m7?U?mg^i7Vl97dK(*Hg5-Z^tWKg$<2Mm#TdoIF}u0D0my^4_jf)C{ezh9 z1C!7{gj`JM3oQJz3`iSt*h zid#r=3ikP6Io$w8f0;x+AnzSx;gzWbdTZslY+1ot%kkJIm=0R=Ok&gS=B0*t0D5;e zE{Ogl(NeRkd)PZkRNJ_QWLeu5+Urf0d5d!WN#cSH@DKRJ3ixE>6kb^?3vI+X=%>6> z7N;4p$|YAm_BuTreRbz~T)gZTXe{#5Q@E*>`f2o2uZ`+2atrRUQO@2<3gZ+`?X=+4 z2wu8vV&&wjnh>CDBt(lPNtuwqRK>0qE{oxwjh3CpiH@aJw1wnDoc%2^wJ~l5xDOUK!ea zfd_167N>kh&ixSCD9jk#{=q$UI-p>iz(?hy0tRRARrcz_)#>yO=YkOQS-kn`otJ>EmiGUvW1CtuFpOCu+@ zB`1rcL-e1N7w(l&Uf5Eehud=xjsV^M-h1vn?_TO&=3e$*?q2?0;a(LtDT}NdWjQ8M zf5rvkO3XFmlCuv6-^ALlz{VbDom*dnwzI!~0A|;L`|kam8M~iR<##jvgNAO~o47WT zHwRVapfis?`-hl@W^^D)Ka`YLll0-JU=Epw%;7ou0Um}TSu_s=Rv!i2A5?oph>zJA zoVo|hezVu?!F%&?s=xz)bFgCn5%UNPyMr|B%*0~FJR&CEAHu||(jnV_G#a@Ne?Wf{ zPp-r#;ItjX^9Z~K1=PX3%%g^-FbJd3#gThOtRgNMU=1d?1}Oi8Il?+nTaFxX)cE_w z{$uveE$`c^L%eG{pYK1u_CD5WLe}X$tkVJc2egmh9)u@D@1LJ^kmTU2U`7ekkop zJLJCS(bz>u$I?DCA(}jcF)yj^G6{rbE^Wyw7prfFU^yDGW62Dd=fbPOM8-&XcHqCU|c( zZWn+V^yD0Q`)Ge=bF}|oMgd)Kmr?xew;RRb!CH)Ou7y=lHdi4z{X6;K+^Ycm&^i^xCPGAry`31AaBduQboxH5(u0sW8m?5a-^I+l4@D=*1xwrqQz&;oOi_ebs0Y|#y^c9EP@eHAr9Ow@aJyzrMd z_YI396=eJytjekRMGJ4V8V7OOj-pD-()wfVV6D~c!kmJc4_gj4BRi76-ABIjoXvU{ z2Ar;c7)`g1ANO|Wpwm{Z8+mY^vtuUT+Ax27YTigYS&Rc|I zGue)~#QjPVK8|{<;yc)^Sqv;L65?c0Lj5{2{IXoxA7p)rgTK$<%M4^Lf1a@)Veq32 z{sIC&!{xezhgjm?=6d3Qj`wl&6$Z~TSZ1(p z0#U>pq65a+l^zeftQ%Iy|0merk0Yp^&`;wraG$e=jXkfG@<(LclB=NbGQgAW+|H3na2 z@RtzyWi%X~1M0uXtb?1@NkWy00Uv73@-bYINCr3%$D;p>CE)MDz{K3I4+{P4O}h)Z zo3*^{i<2aJ(f^)E0O0+9aeOv;MND$9Dy z%72XoVrh&$sKI=RoJ0zC_EuqIRAF22%!U7W^SYQiS{V~xxb>4*Ur84B2mj$<5`L1+ zlFe8aK6CoW<)6oV+Aa(yo+W;(UE00RbpK1;a#^JI;Mm?k6-vX z1UQp}69i{+bpOJor{Iq1r`Jx%G3XIE^)e<6jWvt=nJGAUa1sbT8ZKj`voATf55wh_ zg-)8>#wMeS+YG*sTym)^&cZrg@A0~Gq-#09V4X}Ey+t#3l;m+1sE6$SSuY$0463VoZBi$7~pRveZ+hccvW+3n}(*qm_Wg3o)A 
zhr5F13H0JRbf&RKgs{%UAz8M^rkI6fK59MRS@-)k_Hhb}gaX6u6HQ~axajY9w zWj}4pWc~KRS}mx3?_$4Y+0)G{JO?8)Qy(}NHH5B_SSalS>TPj@vG2_0vk1$H7qC#zSE zp_{mH0bf~M;#U^JVsZ8kzSIdq{r6djyZYTmr2h^FLNQENRujC9&ns|55+bK|zB`kNFJ7;|0*t&6mtMbiZ8ITbt;;8IB81`;h1-a% z>v^a&aw7)ICmDm6__VBWS$SAoW_nP{+dYGl*y@S!sdZQ5aw1M8B_}bQMDN)|XyPSs z+HYzE0$rQLKX;5`^Km13iG<4{rfGvjyyO0%#Br;YlzM zQUYi*Zto$1s4;=4yc!XR0z@jD zUhRM=fTyZRp9hMVaOSTg2{`$qPt5xU1Hr-HXKV+PPCUq@RP%i@ zDi_Gf4;Ynfx|K=6oWh#8GqEdc4m3k*hhWYBioS+d0jR4D^)-6J;4uRkDVY8sZfIfj zOe@@`n5Tq>pLyJ|2m4@v6#pNCIp}M=qb0{#`ah$Go#gR1m?vAzrH4`cx0xXul46Xz z0#UIakGM+1W{Zhbyg3B581cb+#_r9X%Xi3@@i-DBhA);wx!K4i{7Eih9^E@$H8qhM z^YkAe?icaV+rWk`b@08&Faby&CaV0r0M_83aIe7m_RJkK_V;euzr*4Cqgyhi4}WA! z4yC~GEWC)}(a&=pKe?{BZBsIEj&qBkx?t&X3qev)Xpn-kC^&^f18h_t#|sJ8VYA@W zo=c++uD@ZkaF6MqqkH|E4E`B|Z?{E7yG45kOR;sIMrB4@b4EC$EgATqFQ|SDw@`l* z@AP3N+O?qFhz9z+PCBEkDDXtm}Td(~g!jhJx>n!e*-9~G0pUH^q${(3J_u9q!^hMZn=WblRKJAw!W6{+#RbrF3QeJ_%oaF G{(k{*BC&A* diff --git a/Projects/NLP/SMS_Annotation/Input_Data/.DS_Store b/Projects/NLP/SMS_Annotation/Input_Data/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 + + + + Vote tripling SMS transcript parsing + + + + + + + +
+ +

Vote tripling SMS transcript parsing

+ + + + +
+ +

VAN Export Cleaning

+

I have a VAN Export and I need to extract any tripling target names from the note text.

+ +

Upload a csv file with the following columns:

+
    +
  • *VANID* a unique ID for this row +
  • *ContactName* the name of the tripler +
  • *NoteText* free text possibly including names of tripling targets +
+ +
+ + +
+ +

If successful, the output will include two files: +

  • 1. A file of triplers called `van_cleaned.csv`. For each tripler, we provide the following fields (each row represents one text message conversation): +
    • - *VANID* a unique identifier for the conversation +
    • - *names_extract* the extracted names +
    + +
  • 2. A file of conversations for manual review called `van_manual_review.csv`, with the following fields: +
    • - *VANID* a unique identifier for the conversation +
    • - *ContactName* the name of the tripler +
    • - *NoteText* free text possibly including names of tripling targets +
    • - *names_extract* a guess for the extracted names (to be reviewed) +
    +
+ +
+ +

Text Banker Log Cleaning (utilizing text message conversation)

+

I have text banker logs for names provided by vote triplers. I also have access to the initial text conversation. I need these logs cleaned up and standardized.

+ +

We use a different script for these cases, because we can clean up the logs better and perform spell check by looking at the original messages.

+ +

Upload a csv file of the same format as the output of the aggregation in step 1. + This csv file must also contain a 'names' column containing the names logged by a text banker.

+ +
+ + +
+ +

If successful, output will be a file named `labeled_names_cleaned_with_response.csv` with the cleaned names in a column titled "clean_names", along with any other columns in the initial file.

+ +
+ +
+ + + + \ No newline at end of file From 5cb21a9fcafca245231e14452bc7075087cfe9de Mon Sep 17 00:00:00 2001 From: Sandra Chung Date: Fri, 18 Dec 2020 16:23:33 -0700 Subject: [PATCH 04/52] stub of a Flask frontend --- .gitignore | 2 +- .../Code/__pycache__/utilities.cpython-36.pyc | Bin 18029 -> 0 bytes .../NLP/SMS_Annotation/Input_Data/.DS_Store | Bin 6148 -> 0 bytes README_local.md | 18 +++- app.py | 11 +++ requirements.txt | 3 +- templates/upload_form.html | 82 ++++++++++++++++++ 7 files changed, 112 insertions(+), 4 deletions(-) delete mode 100644 Projects/NLP/SMS_Annotation/Code/__pycache__/utilities.cpython-36.pyc delete mode 100644 Projects/NLP/SMS_Annotation/Input_Data/.DS_Store create mode 100644 app.py create mode 100644 templates/upload_form.html diff --git a/.gitignore b/.gitignore index 9f7cd3b..525f974 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -*.DS_Store +.DS_Store *.csv __pycache__ *.pyc diff --git a/Projects/NLP/SMS_Annotation/Code/__pycache__/utilities.cpython-36.pyc b/Projects/NLP/SMS_Annotation/Code/__pycache__/utilities.cpython-36.pyc deleted file mode 100644 index 153d16e9785cc3e3b1c0a078d87468ed572143e5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 18029 zcmch9d5|2}d0%(WeePnhSR4e&6-WXY5Clk&7AcAr2;!h1g9{i7l5lgu(d@k5-PxU; z8NBYr9(iLc0VP0=>Ci4GjxE}zqr{Pwt71FOKZ)ZwaSrEj;y8}nNu`vy$}+1`u2dy| z@t@@P`(DrNfu&V)N?Fvres{m)`+nc?UgOzHrTD_9KB#|T#c}??+45IH{AK*YGoIrp z$9I&gyteDR^7MR9o+&?tXR4iE&iEOZb<*wZa?a05+-v9kg48SeB|J0j@^ZzmxK5B! 
z*}JY^4N{9EDu>u;;4bb_d4ywkJ%6vH3Q1~QQpF^-Pg12MH6f{TlG?8-{sC3>4@&Mx zl6y!}d;|M_NbON$YOfks`_zQmKcDjts{`ubJC6Ub>Z*s-;dfmB5!F)LkjaP*15-cpg<=a4>i% z*cXfkhtz5H^juy&qt2k_G4-r^4mFM|4|Vp=rPTMRPa?mD)C6kpQ_s(3)C=lU$UC84 zRG&uOCzVz&sqaP3NfoNI>Se^9Qs>kwC_AMhbwOQ3&S~{L{#{b9%3CLb{pz!G1$9|{ z4rNcP?^9RQYideeRoB$_tLtidzMyWX*VP;IDa_(cbrZFoQ7dXj-9qe)x})A!b;O=k zpH~gM=Q%a2no6la&8c~{pjv8CEvfc=Y0g#4s)L%}qr7RSwz~1j;&~l3qCib{JCipS zRwl1Dqshq^CZGP)i)WsD@yrX8XP!Rubg}qyVK7*)?Ug79iyx8*r)q9IGTrH} z&NkZZ=_puU4m7f|3t`k~E+Lk_)`%A3T+o?sx59Mll!x3@4bJh~mD z8Y}Y)Of(l-ZKZ>bjmR6&69={2UB*uus)jOiSU}XD(-2PqL8!cp+y)OsdZu*8*CO`J?_T03qZXWt5Moy8`cx0|rs)q&{+n$ztC-5wTf0ezUT ziKcBc%R$MQ=L6I21SVSTVl#CRm^p6y9G5+8t(h>mgXQ7tBdjVRe=o%Z+x!EOrAk+X7LT(3rysbg{DB zGpgHZv=#E(jZUj!T6Y?qB|OxUiPp{fs#$NWn{KxUDwS|O80>pv!JwXLwcDm^)=e9; zwX_|AkU{9%s2;o<7}bg*%zn9L7MG0f&b3MCS3$|GWtrjR99DjLeX`S74kp1D5txI| z)rIbsB#5TdjZC=G>veU6hHcPl5NNh$hxwU~DVmv?&c-CRsiS6Q%~3P+l$kkgW=@%t zW>9QqKE3h8?%dgI^zZ^qBM5J^%dex_pK|l;dWUvjw(pQ#f37s=GY?AWAcn15?RJhu z2v^#QkxtieMMAUC2$}Bb?j7u@6c6 zEH@!EM4@~ zkY=a+;^j_nC8}R&M2$F00O>pf2+RkaW>*0Zn&1!l<4aTcP2(4iBbdj_Zk79Z-J)~X z`7!TydfNF#2W>O$?kW(~i@j4%$8J-XQCWhIe&F=$A$aVxMdQ;Ky3LhkphtN6)mHS` zmD$r<`Q!98-CYctD4e=-?ez3()Ae&5EHDPx?VLW}Rl(_%sMT%(3&T^r^^b}#0jr~C zce(eO8b`?CW}S^A?QTQW=Px%kj8`v6qY7QO7`4!k(u3b_df_nmcr(mWne` zW44Wfa!j+0Fz|~I1N9zwrxCKa*}BsTW3Scqi*}k!V}1o`$En4hn$?HVo!-x2xN&w~ zMVXiMauUU15rO08@z29gPa@SMh47Cny4y4`=wuomC-QCzx*SNVma@~RdHNx=)Q1q% zQg$d0BkqrJQga$(tVi9ti~%F})STuT=|>sJz;}$f9IO#uY>Q`6E7=JSv;q$#9IdeN!7F$J?zX;9a|WCs4IhC%8!9%=u71hWUNuPHuQJ zr#8G(lOMS!H@uTf9^J@Hf_^8_c4l>E^~u_?=$>crWZ+D2+7e6p6)k7Fir28v5_A$BjvsZP76pI}~j>f)RAE0?D()~{cDRTC3x z*|~`# zQVS}iWr5n%fE4V+~O4ZWe7|I{1<}cG{L{_5=4Okk%9Ew zrHnp6?VJ#G5Int=uVsg^-zh*S1VgGFMscRqQNh~I@#Q%_V)He;A|wYm7*5*L&m(r@ zqdhZF0ups#W_EdfW_AcUgrgzHypIH^F=1b!*Hl8oFe6naq2bNEtjY~(*j0H|FlnJ* z^Z`CAf{Igp@RXjDUdc6Cg~pC&<<36 z%Rm;;jcA~b5W^-}kTmqmC>=^sg4R%^Pa+}PHy{W~M>v=uuc$F&Be6x9DJ;{hWw&gr z^Ay?c)7UeSBYz6nL1FaN%o!*)!^N$6{=Ti#3hOdefO>h3ke#ge&INg>2VjS`yxWpQ 
zm|6msvIwsrJzNCpHPp)lxv8PrMF3BD%2_I4(%#fZ?#sy#!RE*B9O3((p^6j`(c=}uTjXNHaz*hW1V{YC}L-KK6ta98wJ;BtV)li0M0 zk=szqtqx3}54my80s{Vx;Wtz$Phnq`OI)Sk(W7cFv>=!9e$iSwk++yKo+;k-Xz%n( zQQDO5x+=Tmg>NMB+>)oifqEHeN2tf8(BGJM`sKB@ZLJFX9oxgLoBfKZP^VGel2>*# zYd~HgFAuzD+kBFjc_1$X+^-DknDS!5_H@^Eo!jj<9gL=ok(D?aQ|_0H`<|;Hrru{rzkj*gGHR56#) zr8LTZ-Be6A>Zhfn_@)t1%Ldm=Rb^W3YMmM5AU8=31De%W!<^l7QfiAzKWvK@8eS zke3*=2L=W$_S6f344cL+ID(2Q9-_W^x{6fn?nExV2iZS@-@pbv_8^jlyBVS%l2Je8 zur^A^2TIP-+Jk@*ShmsY!2@Yk@Nue^^yJk=Keq;0JPKD{y&XhQ#6`%2aRx4_K*vS6k6Mw~ zYVoKQfvvf9+bYyV*xFc}pV^6&2lyET zl%985!D=0EUk?5KNeWu>e6&!fgE3s_wv|7c6aXyY6o*^XO6Ejz3=*;!{+JA@E(wIN z5yCjlJlYI)l*uGw5$_>9Bak}r=+33_CxW#|H=6LW+m-nk!L)Z5`ZkzccpZV0Pp9%O ze-o+WP~0ZGswi@!?wD6bepZZ+2``;WyG3s-H379Qou0t6nmXVX-7M-Hc8h3VbVrd_ zcJ*86b!e94kQhpMDvJuf63B+s+GhyZWYMh#!fcMPJ4%^yd)3@-JPXw>N}qyw!o*1x5?q(tdfdp}Y0f zR%fXlXrWCQhU7PYL~Q;L%Wp|g`^?QPt_oSkE;fq2!0ETTlQ*YO^SAL!6rVJfvIu`t z8HLBnp_J0(HI(v%MXwcO%>;95k(2r9csPeN}{1d-$tKidzvnVeg=llCDarV z1PHXbSrIi^h??Bm1(U(zzSPf~yvZ)*^p~OxG-Bw}Si~|H73#OpogHe=&4oq=( zL*jWGhv>^g_&_5UuS#ahDyhU9K^{SyT47Uf5?$YN_!Z2^ay@6QUqcXA2l|m+^cq11WR2%3c~4OcA?R#O{^B6uOBTmztz!{?Yy#()IsD!gzWd`VOQw zty*jnY)@_le)+ZEu-l3)`PCULd%qZ^7Skp}?L{vj4oWx02^i+sl#(gww@nsW%-g0U z+ClO5@4zT3LWUPRi_$VZ(1^mL(i&;onp%?NrJ?;KT2puwj{4kER)6jG(awn|hkP8z zE;;(QOp*FgQ~-f6UIdX9A=8VCCB{sCvCO(4jH4ii3d|OzL z8V{80`I(%US)b3bur~Qbn8>G%^b_H+&~oR9r4|xOTnIYwGXs$j{S?rTMGSwBH4%g?tx(!QjQaCv@xSm3=?HKNVk?8@ z2)Co4wxUNM9mfy2;Q?X+tyTP_-8SJqw4`4~Kh|cTB8nJnXjseEe4;ajv8{Quptt0S zVU?fSco94KoqOM`EmnXTWz8{YGvXrou9#^L*ub9t0nA>XV^AIjw4(GJg2T|+5|=XV z&~Z43kUNx4=V2|y&h14jyllxXx>SVEA=5EB^^ok-##m9jCV++KF_I0Dgt4+8?R zwD(01p7Rbm_Z>*3wDE@8u&fAo6Fv{oh8GbR5e?^LVW@V++4VZuUfEg8o>RAfkrSqx zj`O8siHcO;!nV8V_8?liEuhINzUuRg=i_xl;>m*xH`nM$%Wz{ac-y@{45eYGdv*Se%QC~LrbDi#+pOyzn|`Bd9u)g zyjkm+EF{bVxt4G zOeIc%!s9&P5(k6bPVCmDno$2Gt5CQ?
HJ)dWz3h;OnspBk?pmlkoJHLe2!tqsC zt!4!L#kG?a9ax7Hv*5K!vZTT=MTqt*gP}@E^{(JgClIJ|*DPM?JxVCP|5Zq#&|@ z@Zs+wVp8|vzF)-tnKWat)Oqd*i@Q%v8Bg*eOH39zMg@Kq?>^3)Oy$0J*NZZXSyMo& z%6xhnwv^<+(b;ZwP`$^iB zRPOz9AN~N)P=0EN8D!icW>75?%$VrAm;`|tlfCQShoWTBLN)`ZrzseZazZCwvBZjw zQ`ig;W~m7?U?mg^i7Vl97dK(*Hg5-Z^tWKg$<2Mm#TdoIF}u0D0my^4_jf)C{ezh9 z1C!7{gj`JM3oQJz3`iSt*h zid#r=3ikP6Io$w8f0;x+AnzSx;gzWbdTZslY+1ot%kkJIm=0R=Ok&gS=B0*t0D5;e zE{Ogl(NeRkd)PZkRNJ_QWLeu5+Urf0d5d!WN#cSH@DKRJ3ixE>6kb^?3vI+X=%>6> z7N;4p$|YAm_BuTreRbz~T)gZTXe{#5Q@E*>`f2o2uZ`+2atrRUQO@2<3gZ+`?X=+4 z2wu8vV&&wjnh>CDBt(lPNtuwqRK>0qE{oxwjh3CpiH@aJw1wnDoc%2^wJ~l5xDOUK!ea zfd_167N>kh&ixSCD9jk#{=q$UI-p>iz(?hy0tRRARrcz_)#>yO=YkOQS-kn`otJ>EmiGUvW1CtuFpOCu+@ zB`1rcL-e1N7w(l&Uf5Eehud=xjsV^M-h1vn?_TO&=3e$*?q2?0;a(LtDT}NdWjQ8M zf5rvkO3XFmlCuv6-^ALlz{VbDom*dnwzI!~0A|;L`|kam8M~iR<##jvgNAO~o47WT zHwRVapfis?`-hl@W^^D)Ka`YLll0-JU=Epw%;7ou0Um}TSu_s=Rv!i2A5?oph>zJA zoVo|hezVu?!F%&?s=xz)bFgCn5%UNPyMr|B%*0~FJR&CEAHu||(jnV_G#a@Ne?Wf{ zPp-r#;ItjX^9Z~K1=PX3%%g^-FbJd3#gThOtRgNMU=1d?1}Oi8Il?+nTaFxX)cE_w z{$uveE$`c^L%eG{pYK1u_CD5WLe}X$tkVJc2egmh9)u@D@1LJ^kmTU2U`7ekkop zJLJCS(bz>u$I?DCA(}jcF)yj^G6{rbE^Wyw7prfFU^yDGW62Dd=fbPOM8-&XcHqCU|c( zZWn+V^yD0Q`)Ge=bF}|oMgd)Kmr?xew;RRb!CH)Ou7y=lHdi4z{X6;K+^Ycm&^i^xCPGAry`31AaBduQboxH5(u0sW8m?5a-^I+l4@D=*1xwrqQz&;oOi_ebs0Y|#y^c9EP@eHAr9Ow@aJyzrMd z_YI396=eJytjekRMGJ4V8V7OOj-pD-()wfVV6D~c!kmJc4_gj4BRi76-ABIjoXvU{ z2Ar;c7)`g1ANO|Wpwm{Z8+mY^vtuUT+Ax27YTigYS&Rc|I zGue)~#QjPVK8|{<;yc)^Sqv;L65?c0Lj5{2{IXoxA7p)rgTK$<%M4^Lf1a@)Veq32 z{sIC&!{xezhgjm?=6d3Qj`wl&6$Z~TSZ1(p z0#U>pq65a+l^zeftQ%Iy|0merk0Yp^&`;wraG$e=jXkfG@<(LclB=NbGQgAW+|H3na2 z@RtzyWi%X~1M0uXtb?1@NkWy00Uv73@-bYINCr3%$D;p>CE)MDz{K3I4+{P4O}h)Z zo3*^{i<2aJ(f^)E0O0+9aeOv;MND$9Dy z%72XoVrh&$sKI=RoJ0zC_EuqIRAF22%!U7W^SYQiS{V~xxb>4*Ur84B2mj$<5`L1+ zlFe8aK6CoW<)6oV+Aa(yo+W;(UE00RbpK1;a#^JI;Mm?k6-vX z1UQp}69i{+bpOJor{Iq1r`Jx%G3XIE^)e<6jWvt=nJGAUa1sbT8ZKj`voATf55wh_ zg-)8>#wMeS+YG*sTym)^&cZrg@A0~Gq-#09V4X}Ey+t#3l;m+1sE6$SSuY$0463VoZBi$7~pRveZ+hccvW+3n}(*qm_Wg3o)A 
zhr5F13H0JRbf&RKgs{%UAz8M^rkI6fK59MRS@-)k_Hhb}gaX6u6HQ~axajY9w zWj}4pWc~KRS}mx3?_$4Y+0)G{JO?8)Qy(}NHH5B_SSalS>TPj@vG2_0vk1$H7qC#zSE zp_{mH0bf~M;#U^JVsZ8kzSIdq{r6djyZYTmr2h^FLNQENRujC9&ns|55+bK|zB`kNFJ7;|0*t&6mtMbiZ8ITbt;;8IB81`;h1-a% z>v^a&aw7)ICmDm6__VBWS$SAoW_nP{+dYGl*y@S!sdZQ5aw1M8B_}bQMDN)|XyPSs z+HYzE0$rQLKX;5`^Km13iG<4{rfGvjyyO0%#Br;YlzM zQUYi*Zto$1s4;=4yc!XR0z@jD zUhRM=fTyZRp9hMVaOSTg2{`$qPt5xU1Hr-HXKV+PPCUq@RP%i@ zDi_Gf4;Ynfx|K=6oWh#8GqEdc4m3k*hhWYBioS+d0jR4D^)-6J;4uRkDVY8sZfIfj zOe@@`n5Tq>pLyJ|2m4@v6#pNCIp}M=qb0{#`ah$Go#gR1m?vAzrH4`cx0xXul46Xz z0#UIakGM+1W{Zhbyg3B581cb+#_r9X%Xi3@@i-DBhA);wx!K4i{7Eih9^E@$H8qhM z^YkAe?icaV+rWk`b@08&Faby&CaV0r0M_83aIe7m_RJkK_V;euzr*4Cqgyhi4}WA! z4yC~GEWC)}(a&=pKe?{BZBsIEj&qBkx?t&X3qev)Xpn-kC^&^f18h_t#|sJ8VYA@W zo=c++uD@ZkaF6MqqkH|E4E`B|Z?{E7yG45kOR;sIMrB4@b4EC$EgATqFQ|SDw@`l* z@AP3N+O?qFhz9z+PCBEkDDXtm}Td(~g!jhJx>n!e*-9~G0pUH^q${(3J_u9q!^hMZn=WblRKJAw!W6{+#RbrF3QeJ_%oaF G{(k{*BC&A* diff --git a/Projects/NLP/SMS_Annotation/Input_Data/.DS_Store b/Projects/NLP/SMS_Annotation/Input_Data/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 + + + + Vote tripling SMS transcript parsing + + + + + + + +
+ +

Vote tripling SMS transcript parsing

+ + + + +
+ +

VAN Export Cleaning

+

I have a VAN Export and I need to extract any tripling target names from the note text.

+ +

Upload a csv file with the following columns:

+
    +
  • *VANID* a unique ID for this row +
  • *ContactName* the name of the tripler +
  • *NoteText* free text possibly including names of tripling targets +
+ +
+ + +
+ +

If successful, the output will include two files: +

  • 1. A file of triplers called `van_cleaned.csv`. For each tripler, we provide the following fields (each row represents one text message conversation): +
    • - *VANID* a unique identifier for the conversation +
    • - *names_extract* the extracted names +
    + +
  • 2. A file of conversations for manual review called `van_manual_review.csv`, with the following fields: +
    • - *VANID* a unique identifier for the conversation +
    • - *ContactName* the name of the tripler +
    • - *NoteText* free text possibly including names of tripling targets +
    • - *names_extract* a guess for the extracted names (to be reviewed) +
    +
+ +
+ +

Text Banker Log Cleaning (utilizing text message conversation)

+

I have text banker logs for names provided by vote triplers. I also have access to the initial text conversation. I need these logs cleaned up and standardized.

+ +

We use a different script for these cases, because we can clean up the logs better and perform spell check by looking at the original messages.

+ +

Upload a csv file of the same format as the output of the aggregation in step 1. + This csv file must also contain a 'names' column containing the names logged by a text banker.

+ +
+ + +
+ +

If successful, output will be a file named `labeled_names_cleaned_with_response.csv` with the cleaned names in a column titled "clean_names", along with any other columns in the initial file.

+ +
+ +
+ + + + \ No newline at end of file From 3cdb61c556d2493393b8aaed84022dfdb2497b2c Mon Sep 17 00:00:00 2001 From: Sandra Chung Date: Sat, 2 Jan 2021 20:33:46 -0700 Subject: [PATCH 05/52] some thoughts on the backend --- .gitignore | 1 + app.py | 67 +++++++++++++++++-- settings.py.example | 3 + templates/upload_form.html | 127 +++++++++++++++++++++++++++++-------- 4 files changed, 166 insertions(+), 32 deletions(-) create mode 100644 settings.py.example diff --git a/.gitignore b/.gitignore index 525f974..e40a865 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ *.csv __pycache__ *.pyc +settings.py diff --git a/app.py b/app.py index 133ddb6..6cb8fce 100644 --- a/app.py +++ b/app.py @@ -1,11 +1,66 @@ -from flask import Flask, request, render_template +from flask import (flash, Flask, g, redirect, render_template, request, + send_from_directory, url_for) +from werkzeug.utils import secure_filename +import os +import settings +import sqlite3 + +# configure app app = Flask(__name__) +APP_ROOT = os.path.dirname(os.path.abspath(__file__)) +app.config['UPLOAD_FOLDER'] = os.path.join(APP_ROOT, 'Projects/NLP/SMS_Annotation/Input_data') + +ALLOWED_EXTENSIONS = settings.ALLOWED_EXTENSIONS +app.secret_key = settings.SECRET_KEY +app.config['MAX_CONTENT_LENGTH'] = settings.MAX_CONTENT_LENGTH + + +# check filename extension on uploaded files +def allowed_file(filename): + return '.' 
in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS + +def queue_job(type, message): + return + +def process_job(job_id): + return + +def complete_job(job_id): + return -@app.route('/') +def delete_file(filename): + return + +# most of the business is here +@app.route('/', methods=['GET', 'POST']) def index(): if request.method == 'POST': - import pdb; pdb.set_trace(); - # handle file upload - return + # check if the post request has the file part + if 'vec_file' not in request.files: + flash('No file part') + return redirect(request.url) + file = request.files['vec_file'] + # if user does not select file, browser also + # submit an empty part without filename + if file.filename == '': + flash('No selected file') + return redirect(request.url) + if file and allowed_file(file.filename): + filename = secure_filename(file.filename) + file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename)) + return redirect(url_for('uploaded_file', + filename=filename)) else: - return render_template('upload_form.html') \ No newline at end of file + return render_template('upload_form.html') + +# returning uploaded files +@app.route('/uploads/') +def uploaded_file(filename): + return send_from_directory(app.config['UPLOAD_FOLDER'], + filename) + # TODO: documentation on setting file directory permissions + # i.e. uploads dir can't be inspected + + + + diff --git a/settings.py.example b/settings.py.example new file mode 100644 index 0000000..9a77ab3 --- /dev/null +++ b/settings.py.example @@ -0,0 +1,3 @@ +ALLOWED_EXTENSIONS = {'csv'} +SECRET_KEY = '' # keep this secret in production! +MAX_CONTENT_LENGTH = 16 * 1024 * 1024 # 16 MB diff --git a/templates/upload_form.html b/templates/upload_form.html index 363344f..15db708 100644 --- a/templates/upload_form.html +++ b/templates/upload_form.html @@ -5,12 +5,19 @@ Vote tripling SMS transcript parsing + @@ -20,46 +27,112 @@

Vote tripling SMS transcript parsing

-