From 0b1457c0be94d0917652c6c367ad74ce27394b2d Mon Sep 17 00:00:00 2001 From: Justin G Date: Wed, 22 Jan 2014 12:57:12 -0600 Subject: [PATCH 1/3] Implement tests for --leading_zeros option --- test/fixtures/leading_zeros.pdf | Bin 0 -> 18777 bytes test/unit/test_extract_images.rb | 9 +++++++++ test/unit/test_extract_pages.rb | 13 +++++++++++++ test/unit/test_extract_text.rb | 9 +++++++++ 4 files changed, 31 insertions(+) create mode 100644 test/fixtures/leading_zeros.pdf diff --git a/test/fixtures/leading_zeros.pdf b/test/fixtures/leading_zeros.pdf new file mode 100644 index 0000000000000000000000000000000000000000..18f84c18889dfeb7ed8e89504d919f382d3321a0 GIT binary patch literal 18777 zcmdUX1yodP_ctjiA>9MgJp)5`BhrmD4BahANOyNi3rGnHf`Ej8f`oK~v>*ynQho!% zHMrmZ-tSxM{jc@1#Nq6Da{r#a&#rT5lq95BfUF#-G#$%3%NxaC9}RSLqH+S*0rtjL zr~(22w!08Ja~BH$7)nwBut{0kxImnspEgD=5DAEhy(t7BB!ueh;si0WMfHHyQ|Li$ z6G4A@z|@)1^hCA+^Puz)Hpq`ciJ7Wg>G->$+Xy*z$X(imH_U6*eov^-RPbNg)qedR3YnG!DOPp&*_LLMwP%K-uwNZy8i~08(x=P>x zREVAF=YN8J*}d9*2MB;o$;kZTe*@<<{jq=l!m@tmI^G zq6%>V+=EUgAq8MlgLt?A^Z;z)_BQrTst!gb5WqPY;?6(-_r>r+LQoK(GhUF@)PMk< zb22KMni>eed)4CST)^`+TupWfDG+$kj!gl&B{tD>Tt8Iu76nv0yQ@U|%hmnIT9YO27@DGzq4qJ(lUL+23OIGi{t=Zc(Cj9{uhc zN5w5peFFwna)e93hn8R&E>X4SH4*4L!rR>>22d~%grAF&_k?P(qta%-c!jC)O==45 zQZBJ}AJI$B?8ru?*>swP{78gc-Bg-+^ZW-$kR~`k|r2%)2n`xM>ExtF*Z` zVIWCdqOGtUI2h!_4&Ike7!luiEwW)eip-b75yE7Ri?GL1Etl82&5pKfRKFPmnG)OO z7eYPm_V3oBN3#?7RoW*@!qRBlrdh;&DkuH)Nqcm~;#Rm|EK#BFhU;6h8xCb`W7|XEVQ&2ixe!cYHvnLyOql zv~h;bv1Vmj<37&uw}0Cb%zKd5vvsV&Uqm4M@U|=-2SjqYsJKW@qPG_vBcI6Gjm=MYzM8WE3YM@1cq5O1_?aWie)uL4fGvZ@FnD;Ln89Yvc|P=HGuIB z;#I1bMe4Q}gb}|DPi-gc3Fp0W4-1Ie-Tcy`Jg(9?2IP6RdPISy8*aC7SCmS1Wgk7p z)QKjKnJ9lQ(54mcRxFasBUH|Ztn6^ihBrz5I2_<-0Wg7nGFb4*@T@X$mLdpGMTN#; zP(^{`uxeQ{AG9$fg>;$g8^@gm;_Xde?Y29)NCD+%u2b|G#+)(bd#gfsx)ym1Qa z9f+L@H)M2k9Ok*-L=`_NvUFR|yo3i{CKAD`sf9cB_(B+UZ3GJ>UtpMn4B8TqMY@sU z2c@)-Nwy`#VzsDYu_>+Ns0!dpB-3Lt3P@f=$;3G>!QTkUHA*GK?<86b(T-d&)-W11 zzQaC}YLcpPmtcj<42$F8H~&K8k?JBXx+lVJC|uYXQ7j!AhB1css#2=q$Bj1mNin)! z5q%(a@Tn)%1=)cs5b7|;A{+fS+HWqa`yIN*4As%&qAz7hX%!N1WrL{+ zRHzx!Y9Fe}T2blI3eq-FCcNP@m3AW37SUI%pw0mp&|t<3D<#s?;ws)Dl@yy$;Jx!g zdxMbQGH;yzh9kNhffmd`($=Q=U*3XPvZ>;zgsFIF{m`V!)b1 zXF%zD=d9?Y9!|Mcq*e7{ew9Y9)Ix2WV34e2dR~%osByY+m~p@d@M~q7m+=qd>uxEQ zh^$w=rzxZ)78@$O|GvIfzLswiGRaOR&0TOjl(7DKmtw7H7X?BCxwlNeOu9_GT=Q1t zMjN}~A%_Z4VPryNV`R)%&XpJai~Unvm6rP7Uos?pW3*#@kVKVamIRv=Q+ik3egt6g!s>X0wJfT?xZgdyKg)pg`_f6s$Et=f&-x>)z4niS9}|eviJOUu zh}DTxI6J{a>2~RF(wEa$IlF2tbkLq5=q&53KGSDUO55sHGoz`A;LRx)aWDLc7g5(`S&xvF`)@YXhme<&)^#{e_~c;*H-EZb)eL^4LSz3v#y-_C z?J*EZFvKII3FBbg#f`o((0c@XV{M_nZ#!}kiO!3zdPuf-E=#+Qq3>p2J{mf~7lL6H zLZ?B;g?dt_I95~ENCOXt++p>pq(z;Lqh9&3s8zQP+m4xW!HLuQVF%sAFKYCHE7j#WF#k{Ae+Pb z9){CavV6B<*WiWwGQwCwW5{v*YA4fb)VAIsRXoc?73k7$*)`o?>=8YIzfUV5LmmaB}CRd_X5$CwFvuH0*f+m!;QTueo)%q|r! zKOH$7Gb?G@-F!4=*k2QB9ur(I&h|a&Tcu(X2UGoIDEufOy} zrVK3xwJ@ALKS&xZ%kgnPZZbE1)A7Z6a3f9l&g|o6-lNfd+f~!Gr6G6A&qc-Ivt`Y~ z`wvfZk4<*F;?_464;7PgqJ-Tapq%V@(At`<%pC-NJ-%*%Uznj6b~n^UzOs$ajofn@ z)E<@<7Z)>fhL{2_Y*-b5-c=jmxh4LmO?zp4{}=fw`jdb7DYoFizv0rAPxgzW0(B{V zGT`5MD!@xG?k`v6f`z~MDyYBtD(9ZvrLS_)^NQF1acAWU{a^e~;Dvwl&mM`Mf+$8C zJ9^`1yp>K?y~!e$CV!7t1u_DL#$p!Yll#75wXZPfLwKSWJw#&=3-HRt-(qZ#alahU zbKk*6RSGjfC}!*2=esr1u=_8dwg1xnApLqPO;NN8?Ma;JtI z+G_qn0%EZ#57Il1MdngkDkP$LFhQ`iN4}HGLyZxtN@_cR;<9p7jqc}mC!8H2eBQy;?z~ra*Jh!NbZAg z2E|pJ;sch`992|tu*H!a=-*gw@GU;lq1p&-s*MstZ8b0+YYUCg={virTMQ1q4(!Fo z{s0#As~i4rfCXJf8~y_Jf`$J^VlR4L@%le4v7oCk$}bWNx{Qnb7pY>Qb6_cd0k#7i z@;AUoFC%*}ij#m(=5;mI*s{xqI0Som*L@kP)7HDG^L4_6ZV|7ScEe)2@z_LGhCc-0X& z5X^b>i}BPRdb0rUiTGWNN|FkXJbv>ChW63735;*uW?bbY0}J)p-jO-)-RZo|cQ|}o z=lw!|EV?v>r@|osWBc5AdVX#^jUlxMAa+^s+<#Z|f%@EdN}GRhZam?g8&4e3;#1xE zSX*+WI`od%?|7SG^+>?se}n9mcmlcp@*Oy?^z*+K**~o(j(=-Car~+t|I@md^Vbu5 zi7e-(eemCOldb#xCacE-j<^o&#m4^7#hm|OJ#k*D-k+%%&P$EMh@(lhnvZ6mSZg|eaJzNu8cHQdR(2N@@|N^EVYE@tx$#*JTkPxCsZ zkV;qzg}jzimqqYJ;-eruS!(Kc`WBt9brmGAxd1Cod}KprwPs%CB#y>>-1}|sUTU?# zs>Om>{sz)3%>)ELU-uy@66`D}}!Kq%57BUBoSnoB+_%#CF%{@&e2aVAHfTb+I^y5Dex*h5nv@{*S-EG2vIw_~({> zuI|Tb6^v}53EIEeiH(uDGXQ!dE^@+R7g=W(Ug(BcIC+2oASV=3AP@x2KC{U}IcsSm zYG-Z(fm-owqRuAgIc;uk=-=m)onNqkpoTD;xRHYl#M0aX`Xw}&z^3W~vDEdYBhv82}F$I+C0D1qYy0LP3VI&f~`=5X=r8?HmHACWC+g;2$UaCzuz6tK)j9 z{J@vDwI7$zoHx(KApcN&(4>9e1y_Bvr80T;eb#P#A-`;KZybB;Jq6=p1vVnTP^; z1;>r*WK;{0pf0vM6C6a16+U`Wc`(+8zOleH`fYCiSHWv>1*+ue|3l8kBv%k=M$l@|N8LW3GD zAGTBqmq;DY!uywFf5GiG!jO3g2i6~gDHy|(GE`z(+fLcrTCG%wVSI@~oJOxm@zuci zRAH{kd;7RlpDn;0JyR39+lvZuudx)R9I+!U#cEp%G$m4nkg?uu z3l=Hv8RXF3m~N^J%~eI*!A5-=vl*0TNX6zC#rzQD5TDhhvYKwCY+_yvU|mr>t#eXQIWl8F6?ib^aI-cb$`0!mSnXv}}m$ ztqZF5%Ndj9lC6E1`zKmy98>rJx%v~3xtCR1dotgosT_+VPi)R^k;}7P(RF0YXioW$ ziIau+mj0YNMM)))I8fnTXdblycE{K}PSz=%G?*%I`3o>jqVq<#+<12;t#n7{Z8}I~ zWc)*BKCGi#St%keexyPb_&lci$)TClBGII1+++)PBVPgY#N<%~@U`G1O>raPvTZPA z4CQU);+kdnbybx7L4`uXq z)X{l;__|9c-K>A-sC<^VY%E5t-QK2T>}$EJnRC}Bt*Ft8rI-9nq-So$-dA~zmXI9V zy;Odk4#$JX<|rTW%ZlO6?$fEc_kK9Z?!&KmZC$>17(If=mda&Ve)H6th5iiF8V@aj z1#7evr)#EWlv7lX?g?04%R0SEgG`gIf@b<>4Fl+rf-a@1w@@W9}4(Sjw z@bQe8O)d?+bN5*5$AQlP#OW%0{fBUwu4ub~7{SrGZUT1$#T^K{3;-IX0fRyMsCuwO zk?p*i`75UbYXiyyVtlybG|wx7IXa|#@k@~e(Zx61+V#gVD_iHf8Jz^wD6CNQ(7l5y zN<$HOZm=}WJ1AlnDHY| zO>h+2J{3lZQn)5jZ^&M#2tj6$Z!62r&BEdl7Y=XMCDcw1AP~DcL(U!`loq7{846ve7mlu>`$*UMNSufc&J`!J1E=ri`*ou`vun$%U=?h^M zB_&)C)q?5S-qH_=aDZ>7T&z<#&Dwxd{fpL%PFFF0#TpJrq6z92kohV7Qr;u18z>{UG+8oyVano z`99ostyGL>9(D4uNK9W?qvu_5-9IsB2Gn(ISjQs^ej&?)k&yc!i$N4HrnLi;CP~~) z?u1&&k;gKpQjeXO`CiFUjY?f|L^(>0$u-QX5NAzvm);ZA8@G~sJ!Xw!je; zpo)=^DrH8ekCZyck=5&E1IwRD)}vs2Z{p@At>=wnq2oBy7p4&fsyk6j6sh>qa;tLO z3tKOPgpP@ic4CBs3m)z9vSXTIVU`5AO-G#;;7<-Tn#J#hYU1#S53Mb`FeC-dI@}RfrQO{onKx=yZ`GX2lAs?CBy>Re^ac5#q zDwh2~*XvG+jauSocZhOhdkStP=Q6;p>s!l4rPYmniJZ~VNT%!_)v(r?3%6hH$h%br-EUVy2wF9>t>ZBy)BE?>F}m+>#df!y`Bhh%-!BG zk?<+&E{mu6#kc+e3qw+>A(n?H6pMDn$n+EFZWX)A z+QRZ;IXRns6;%x1g)0hureU0m^G>Q{PK$voUkY{v^z{QEK9RpSspdE_MX}#Mj+_abnkTDHr`pzecCYeib#U; zc9UGogH*)N%}gc|oyfW5W`k^)W4=CA(`TY}N*gITr?)o3w@+cPU}3F!C*Ne4hel`( zYSk$n&J1w1NAWE5OMl@a8OV+k2oe&X@)*_7@2S(60p`!*a8;N+FIKe+C7=*{EwpJc zR74)mA9OmR;XEb(Kxa(L>x`?a@Oh=AvFaRxD1(hlOXZh>eT+FO0cKl|u{}@grm7Z} z57~J(BVM1aM6~^+s?uUQkvt&Jg1C>XVCQIunBV}FPqqN25o{u2)^PJcw*`Ji!Ba}L zVzi3U`PMz;q??5?m%UO^Q#{|b(?QrTrTBxBq_yYot zgJfUIK>v|z1uhrjgqmx>kgITGKcK6B$KOZPQavUGUsT*?Ha=On2Cs zy2hvJpQZZuzEx?uwG>)xKfU3d^N!cTc^@#m8f@O`o}!W2Ha1GztF3PP?8}U;EsCw! z%b;bLhh8N0&A~9pdI;U{1kKgG_*s%S?2^T_x$oyl^R&#_k@6yx zzazkQ`!0q|Fpb$J{uyn-=rkEaq#nzboEC7BVSu@?0FT9bbU*NgPvc-Wwgn3vmF@`| zrNE94*8Vg>+pLcq5??B2$2yXzCnDC;VaPFOL0II@!pJdA zY=IPObuKV%Nfuw>;o25`1OvM;Me0AUVrLK7a})DomuCT}1l9?}Ja$e2y50uJ$44w= zeqV5FC+GkthQe*SaG36^3!`4|Sx-KM-T$Z;yO?pSEXa68c|Nvpe`G>yl@jHXNi5^h zq0^?%YNm9U0fj8R`9raj!=mlRh}T>U&1F5+@`uZdV`CKE)ZuxIB}}`wj+lqq7`F=> z3H@G#&iQiglM@L|#X)ZPiN=?}!w|$F5Zyll?P$qW7gZ-41gr9(8mde(WQM)wW8&8| z8RUr#Ph)vF%|i4NxFxIySFCsFtwDuh(Gl+oUlf89 zQjosmqO;o5TTxa#hmVNfoJrQ+zty;CCH>{o{IX_%@An-}BDY0)hHqyd-Azw!f|xxR z4!iotbg>3#4oBdJY4#Z;rY9+H!^Zb85J{Dq7JOo+zq8yXpVc&c4@Odwd-hG=`xn6p zlrM>Que*o)9Cn{ z@$$f4ATyX*UcktZGED*)7I!0pbI{LR?+F7Vqjs^wx>k*4E{(ROHl=G>?~_|gHqvp_D811e0{e2>9c$GOU?=4ls5hAH}D5NfsM z|MIEEdS;u7p(9TEXa9r~vfg*8h{$png< zW0Njn`&!>a5&2KLn%tfzWJnteeV4ZMd~=)0|NBv&=!e;9t=;ZuNyR#>rzu6fTjkE_ z1bA!{W4kLPdSzX!#PVe2pAiC(nK~ilNB3wONqF(@F2^q?iWO1ex=Jy{$uOW86@mm* zV;LU@VQ+HRs=OV5z2)v-TV~3}2v=}KNTN>6K;`jO%81QV9a|$P9S$3eP+>ST&uNx+ zq^stJKiD1C`GM+1`ZQr#uqWjj)#rV?yyshoU&IBX#|MtJVrek9cax`va&Dd}R7Ht6 zu7Oq}HQ3uNS&Pw3X%ef@DVs517f(q}T-jP%@T@ou-D2|E*S-YwXyk_{IYoP@q zsMNtGvK+>UGA2y<_ENbpu3>RCU@BAB3gz?RyX=}&2vlAj>nf+~Ylg%KAHA~x1=f)t z=*L)gmvCeALW&c<7$V0cy|ba}4AhN51OgResSPPP0!_qa1V4%yH3l_h7m-x;W02iI zh!Ty@Xz!3vOc@r(i;u`*H7oWMDu*E{Qh3$%)=?}EgO`#PSv+GHpRW2fU#8)(F=a|;dbX4^uSt|H3#X=QrfE3G$IWR}YI1eb8()_;Ud9>E)-D2ZI@dPspy0`P+ zoA6!=7!wgSIep!Me#Nv*ETAVm# zcpMW0-dPYxSY;)-g|LJ{J%ZhcxQUU6r$N9|py6g~2X9n|41J{5-2ffEGpvY+H@o`f z*_lS94`!Xc+d1kgYs&kh(aApxH92=eo+i|OKkY_2kwZE?G3m8!+zw>NP|Rg@C`{-! z{gz0dV@JE)vnl~zBd>YWGZI6h^90Q%hA-KS?VYy0mbw0_mavESK1M*PTJr50NtNfB z$SG0@eGT#BJem#RPK!tjlL>KK!{5?C?r=MFc{6GE3tt!=5KzDiuQX;mbnkzvwjP+5 zsNqW1cx@)ZFPetGPK{cQJ~JJPXp)ynZRU`PqZUvVOZ!woETej@-^Dl??X#nSC(gdI z7Xc>e%m=R0pQxhjg;BbvM&Ag zo-Fd0XFgvd3gs;9-VMKfW?g(pu#0i8&TVnjwk$n6f4@-u`BvlpdV+Nsh(kq>a~X0x3Nk!iYg8yV!kH8^tVdsW zB3FXmxDiYT(JS|?pl??^n9v^wZZ!Fxo$@hwJUM(w=f5g^82NU6{P^&r-bP&l!xUP) z5bg3%7uLr%k`AJm&o|gw?-BU3=?RH!W_%NDtWHh$*|LUyP)^Q}264l+qHU>SLt za+!N^`;^-V}+OT|9?!B#5UP=0I&kRvW9NXLoP$(q{vX-cX z!xNBRRXvaGL(^Gov!?*`N#3bO@TWAg4pxgXQ z`K}TxH=uGiI_7b@!@4EvqePfk3LF$4W~E-dPHufR#$fU7LsR@rs&hs|h)yl;m%sz{*H}FPuFbHHJ`QAm1MyW-2?Y8p z-2%Q!g#S#9o_Hf#BDMMz@c+|yp+naa%+3$ zQxo@ohs~6?L|8OQBX?Atq8G*4RV=_p~kc1X2(YCr}ggQ76Lin{60gBzc{(D(G19& z1||DHP3}HAkU&BSC3e((&DQjFxPb{jGhZ#kvhv;zlWH@M(@9vrxWO{@jL9)vbYR#6 zah;vcozQrq59SG5VbZ;B^^U|~0t>YmoiI-pD^by;thP<%29iB&wQRl#g~`AM_&r!P z$_g@naGRSHV#YDglmwAYHNR}EMz2iDW|-{zGU6I^Vd^Hx_jv@lu8&zp>w}hwN=e=1 zMwqH3vYQ7>#u+8iP7`OjFYOro1}HyD-pt=w$*L9~oG5%Rt0BYsFjt=I9sW3QJt3ZI zJ^xmeiSoKyG_8n$AW8Bus^#!@>~7+s)H?M?5oQ-2Orov=@n}2Cew(OIF?SC5ID@$k zIy$6~(#aD%clKp%Y0O?a=5*eaD7+t9=|m?BSyour=?Zy2P5Q3WkbsxC@Fbp~kk=xV zlPIfUx+dPDQG`q)-1Uzn$`gXj@ULLKD&CHE*nx7+N7C6ZHwci)1>{kwLv9ctpw*>(Ng2WyjtB2 z+*Xthz?{d!?HDaOR`d<+!f|#vO3kYKwi5JedP)OT4~CR{?KK;@3q0V%xdVxn-1ptb zM;s!UG2QlvVYQcQ;91Jag3*JA^=X!r8*yU<#_gXs(ofq39^>@`o_T2=-k6%9RIHtw z`K<0w{#{9KFZfwP=Oi|TwfJ6Lq`Ixk^H##L*Ea5hTGmkx5x`L$lPAwQwOF?BotaH` zX8Tq?-aD$|huE73JXzr?+w0LdR^weeu-H`Q5hRf%EyUfHeT&zgNZCshCgY2?XDc3E zJ}iu%lb={keroUl5#P1r@wC=59C2ae8Vnf?n&1t=6uU>q>xiR4bNj@^Mcm^y4XEao zBE;bL2B{S-VxM;TmktIQCglPWBe?`QaRz643+;nH1<#62?amF5G8Xf%$)<4Ol=aNEDPU6# z)&=u@yGv|#k>bQB8?Da+$9qFZL^kUD5M_?H_*}hxU!;Thcaipd*GvqlQg(n=e)|LV zgWr`Cl}XF>-EcNcyvFtW-p%ygM4Ni8ZDJ4*BA^qfBE7M$z|Jn#KhUy{wohcfs(L7N zrhTe&Dx4^BGqn(yfVG!@atp$};^MEZ1uQd?w?S)+`2?wS{5E~3uj@3iG3hwf;BDms z_jV;p9kQ}xibo5L^|V}X6`VsRyf)$YiTL}&bV1w+O!G(eg3~bW;a*{%mkxF?zkJ?a4Gf~9BgYkf zTL0v8_6gkG|h1O00`i=ofraF|r(R!Rwg#^7juswM- zyS!R}2Z{;Uc6@-lo)HA^|4LQ#TFnn&34bLrM4jELdHVi^i&j4x(;wNB@RX4^? zR${iTmnSc$Gi}C5mqQeYqHC1)n48rJAYJX|Hj^Gurbdt!#9 zMtQHKj0t2zgDm|8P8voSqFtVqng3R*5g?WTn(4M_p&ugaap~gF4u9!~kBj$KUsA(h zm;Py|2HRKBmY)1W$lj9@HsD^Pyf?8+G0siX%BJ?)Wy}Dtk;${@Q0{pKM$_AN194hv z7Nn$hcR+j^CA9U3NX#RIMEYRyZI2NbL_Xt}JtQ`&8Oh&XX5TFy!lcmc@g~fFT&0Sr zFqt-8RQqXodiWN5?L2YLTclG)0`{XjEK24K~&N>U7f86vwDu+ zcFjUs1J0Q&J7US8+xz8s3cWrBk4qoFjm1+q9!))2A^?6b%MY%#_g?Yt4qkSIOKtU2 z!OSe3w!+ARv3cQ$mZVznj8tG@hvRMyj)#>RhF7qANnV-I~Aiaz587Mp|sAXAHbOhw`aZ>;?YqF=8*t=eBz`z);?OUi&{zg`<-!iu;O|DdBXde7fIg}vd6G?i4&JE7Ml(Na`t*oz1F zHDD=Gs7TxZr>LNe0emq>vvliuwM@TdZl0za_dJcI7qU*@qo=%f%wLZ2d){jb zO`HM+>>1Jfb8fZcITj=k>cjNn-Dnp+J0sNR30rt8$FUi`c%z%~k!uFJrU7lRbx~WD z`5knxMHxDvE?TdCz=Gr)MZ0SPVMPmPd&FWr>{D0^f(&$SecIrZyFm+us>X*6Jnv=E zxnaM$-e~V>77k8PG_L>l9=$RYI&6|iW1FeR2*mi1PSt3fnq=zJ~~>j_Hc<&L(znzSQih2>b33zYU2)Pr;!WJ}ls>1Th9IMd~haHqUsR zAgSRT8+3AM-i94X&cn`<0^6*eG!9Uf#|oIXHM)?Nhic%FN-I*CI=&ex=^G{;wlb>Q zADQc*Yk4vlNVB?~6@&iuz4)Qsx94b%o!Z>9)7m<^eEoL~4{t>*JezM}uqrL9DAQ3< zBKWw4x##-)MjQu_WZpliFvdV?qrT7dgg|oF*Hy(#@Ne9Jt6~`tH<0(L>d`Y? zk+`nPaOj~`F91bVfEKiRhn1HV2vBx4a&qwmK;KJnQFG~zq7KU-c^iTyI@m3MOwK+o{YGhx3x zV4`N}0xfd6*ebN*Pt?`L!rlq`9)=&q>(EjmD~O2;fa^y&%6UDQsjCU})fN|gfVLNh zzR}{mDv0Z{6y@UD(%w$u{2d8gmp-k$z3Fe)a*+G(_D;}3F0M-#S5#E#F5eXumFtJA z`qL}%aDtejLSGKSjmrMl540~BS}|+}_(2151KFXoo&NyrE@|xSAa3YTztgzE=TCF} zPJ=dt?(Yv82wJW5Cyj##3jgozIG|YkN#o`@=htuTfE*m>Pk;PIV`t~&fgZcxX&fN- zYwP0xpFgPdXS;I<{-AO4UT?>BeSI9fyx043pBHNXF&;0p;OP$o{ z*V}>rW)2>n>wUSd;|soi4%orG*XQ8kfY!kMu@)|NXcgI?G>+?h;rbiibMb(#?Sq@^ zZ~B6{ukC}2=Q^*rc+X3{f9Dl9i1*r>x!KQ4bpLGkH!{e1T~@iFdiPpip7W<(e;<#B z{rZ}DIL;sA`mG&Q-mc97go^K<`v7uq@?58JU#D@hU(*LbE}nB$_4~YBJlFej@m`Y! zAQvzAPkf;??rUq|;=RUaAUFGYt?Tdea true, :output => OUTPUT) + assert_directory_contains(OUTPUT, ['leading_zeros_01.png', 'leading_zeros_02.png', + 'leading_zeros_03.png', 'leading_zeros_04.png', + 'leading_zeros_05.png', 'leading_zeros_06.png', + 'leading_zeros_07.png', 'leading_zeros_08.png', + 'leading_zeros_09.png', 'leading_zeros_10.png']) + end + end diff --git a/test/unit/test_extract_pages.rb b/test/unit/test_extract_pages.rb index e0b1015..42e5a81 100755 --- a/test/unit/test_extract_pages.rb +++ b/test/unit/test_extract_pages.rb @@ -24,4 +24,17 @@ def test_name_escaping_while_extracting_pages assert Dir["#{OUTPUT}/*.pdf"].length == 2 end + def test_leading_zeros_while_extracting_pages + Docsplit.extract_pages('test/fixtures/leading_zeros.pdf', :leading_zeros => true, :output => OUTPUT) + + doc_data_path = File.join(OUTPUT, 'doc_data.txt') + File.delete(doc_data_path) if File.exists?(doc_data_path) + + assert_directory_contains(OUTPUT, ['leading_zeros_01.pdf', 'leading_zeros_02.pdf', + 'leading_zeros_03.pdf', 'leading_zeros_04.pdf', + 'leading_zeros_05.pdf', 'leading_zeros_06.pdf', + 'leading_zeros_07.pdf', 'leading_zeros_08.pdf', + 'leading_zeros_09.pdf', 'leading_zeros_10.pdf']) + end + end diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb index 00d24e3..79a1f32 100755 --- a/test/unit/test_extract_text.rb +++ b/test/unit/test_extract_text.rb @@ -54,4 +54,13 @@ def test_name_escaping_while_extracting_text assert Dir["#{OUTPUT}/*.txt"].length == 2 end + def test_leading_zeros_while_extracting_text + Docsplit.extract_text('test/fixtures/leading_zeros.pdf', :pages => 'all', :leading_zeros => true, :output => OUTPUT) + assert_directory_contains(OUTPUT, ['leading_zeros_01.txt', 'leading_zeros_02.txt', + 'leading_zeros_03.txt', 'leading_zeros_04.txt', + 'leading_zeros_05.txt', 'leading_zeros_06.txt', + 'leading_zeros_07.txt', 'leading_zeros_08.txt', + 'leading_zeros_09.txt', 'leading_zeros_10.txt']) + end + end From d1d65658cf52c30f173f980d3991851119fc6525 Mon Sep 17 00:00:00 2001 From: Justin G Date: Wed, 22 Jan 2014 13:25:00 -0600 Subject: [PATCH 2/3] Add the command line option --leading_zeros to pad the filename page number, allowing numerical sorting --- lib/docsplit/command_line.rb | 3 +++ lib/docsplit/image_extractor.rb | 11 ++++++++++- lib/docsplit/page_extractor.rb | 11 ++++++++++- lib/docsplit/text_extractor.rb | 16 +++++++++++++--- 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb index 60ee7ef..ae52dfb 100755 --- a/lib/docsplit/command_line.rb +++ b/lib/docsplit/command_line.rb @@ -101,6 +101,9 @@ def parse_options opts.on('-r', '--rolling', 'generate images from each previous image') do |r| @options[:rolling] = true end + opts.on('--leading_zeros', 'include leading zeros when naming a page') do |l| + @options[:leading_zeros] = true + end opts.on_tail('-v', '--version', 'display docsplit version') do puts "Docsplit version #{Docsplit::VERSION}" exit diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 8c29bbc..510e14c 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -32,6 +32,7 @@ def convert(pdf, size, format, previous=nil) basename = File.basename(pdf, File.extname(pdf)) directory = directory_for(size) pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s + page_format = page_number_format(pdf) escaped_pdf = ESCAPE[pdf] FileUtils.mkdir_p(directory) unless File.exists?(directory) common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}" @@ -41,7 +42,8 @@ def convert(pdf, size, format, previous=nil) raise ExtractionFailed, result if $? != 0 else page_list(pages).each do |page| - out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] + page_number = sprintf(page_format, page) + out_file = ESCAPE[File.join(directory, "#{basename}_#{page_number}.#{format}")] cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 @@ -63,6 +65,7 @@ def extract_options(options) @sizes = [options[:size]].flatten.compact @sizes = [nil] if @sizes.empty? @rolling = !!options[:rolling] + @zeros = !!options[:leading_zeros] end # If there's only one size requested, generate the images directly into @@ -98,6 +101,12 @@ def page_list(pages) }.flatten.uniq.sort end + # Generate the appropriate page number format. + def page_number_format(pdf) + digits = Docsplit.extract_length(pdf).to_s.length + @zeros ? "%0#{digits}d" : "%d" + end + end end diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb index 1b9bf7f..9815c63 100644 --- a/lib/docsplit/page_extractor.rb +++ b/lib/docsplit/page_extractor.rb @@ -9,7 +9,8 @@ def extract(pdfs, opts) extract_options opts [pdfs].flatten.each do |pdf| pdf_name = File.basename(pdf, File.extname(pdf)) - page_path = File.join(@output, "#{pdf_name}_%d.pdf") + page_format = page_number_format(pdf) + page_path = File.join(@output, "#{pdf_name}_#{page_format}.pdf") FileUtils.mkdir_p @output unless File.exists?(@output) cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability @@ -29,6 +30,14 @@ def extract(pdfs, opts) def extract_options(options) @output = options[:output] || '.' + @zeros = !!options[:leading_zeros] + end + + # Generate the appropriate page number format. + def page_number_format(pdf) + digits = Docsplit.extract_length(pdf).to_s.length + # PDFTailor doesn't support printf-style format in the output, yet + (!DEPENDENCIES[:pdftailor] && @zeros) ? "%0#{digits}d" : "%d" end end diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 0d55f32..6a1ab39 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -59,12 +59,14 @@ def extract_from_pdf(pdf, pages) def extract_from_ocr(pdf, pages) tempdir = Dir.mktmpdir base_path = File.join(@output, @pdf_name) + page_format = page_number_format(pdf) escaped_pdf = ESCAPE[pdf] if pages pages.each do |page| - tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif" + page_number = sprintf(page_format, page) + tiff = "#{tempdir}/#{@pdf_name}_#{page_number}.tif" escaped_tiff = ESCAPE[tiff] - file = "#{base_path}_#{page}" + file = "#{base_path}_#{page_number}" run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1" clean_text(file + '.txt') if @clean_ocr @@ -109,7 +111,8 @@ def extract_full(pdf) # Extract the contents of a single page of text, directly, adding it to # the `@pages_to_ocr` list if the text length is inadequate. def extract_page(pdf, page) - text_path = File.join(@output, "#{@pdf_name}_#{page}.txt") + page_number = sprintf(page_number_format(pdf), page) + text_path = File.join(@output, "#{@pdf_name}_#{page_number}.txt") run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" unless @forbid_ocr @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE @@ -123,6 +126,13 @@ def extract_options(options) @forbid_ocr = options[:ocr] == false @clean_ocr = !(options[:clean] == false) @language = options[:language] || 'eng' + @zeros = !!options[:leading_zeros] + end + + # Generate the appropriate page number format. + def page_number_format(pdf) + digits = Docsplit.extract_length(pdf).to_s.length + @zeros ? "%0#{digits}d" : "%d" end end From fd480b3df101c83e1984168da86459e3c3783083 Mon Sep 17 00:00:00 2001 From: Justin G Date: Wed, 22 Jan 2014 13:26:42 -0600 Subject: [PATCH 3/3] Include documentation for the --leading_zeros option --- index.html | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/index.html b/index.html index 981c34d..2a70873 100755 --- a/index.html +++ b/index.html @@ -172,7 +172,10 @@

Usage

The Docsplit gem includes both the docsplit command-line utility as well as a Ruby API. The available commands and options are identical in both.
--output or -o can be passed to any command in order to - store the generated files in a directory of your choosing. + store the generated files in a directory of your choosing.
+ --leading_zeros can be passed to any command extracting individual + pages in order to pad the files' page numbers with zeros, resulting in + numerical ordering for particular environments.