ropensci/tabulapdf

Different Extraction Format for Apparently Identical Tables

billdenney opened this issue · 0 comments

Please specify whether your issue is about:

  • a possible bug
  • a question about package functionality
  • a suggested code or documentation change, improvement to the code, or feature request

I believe that this is an issue with the underlying Java library, but since I'm using tabulizer, I'm posting here and will link to this issue there.

I have a file with two tables that look essentially identical, but they extract very differently. Is there a reason that the table on page 33 extracts with multiple lines in a single cell (values separated by `\r`), while the table on page 34, which visually looks the same, extracts with one value per cell (as I would prefer)?

library(tabulizer)
source_url <- "https://ir.library.oregonstate.edu/xmlui/bitstream/handle/1957/39168/KarnprachaChanida2013.pdf"
extract_tables(file=source_url, pages=33)
#> [[1]]
#>      [,1]                                                       
#> [1,] "Sample"                                                   
#> [2,] ""                                                         
#> [3,] "1\r2\r3\r4\r5\r6\r7\r8\r9\r10\r11\r12\r13\r14\r15\r16\r17"
#> [4,] "mean\rSD"                                                 
#>      [,2]                                                                                                                                                              
#> [1,] "Parameter"                                                                                                                                                       
#> [2,] "Cmax\r(μg/L)"                                                                                                                                                   
#> [3,] "196.6470\r164.7870\r305.5020\r31.8423\r89.9160\r188.6820\r205.3200\r111.8640\r29.3820\r172.5750\r137.0000\r97.7040\r22.4967\r71.6850\r18.1248\r351.5220\r70.8000"
#> [4,] "133.285\r96.836"                                                                                                                                                 
#>      [,3]                                                                                                                                    
#> [1,] ""                                                                                                                                      
#> [2,] "Tmax\r(h)"                                                                                                                             
#> [3,] "1.0000\r1.0000\r0.5000\r4.0000\r1.5000\r1.0000\r1.0000\r1.0000\r4.0000\r1.0000\r4.0000\r1.0000\r8.0000\r1.5000\r2.0000\r1.0000\r1.0000"
#> [4,] "2.029\r1.932"                                                                                                                          
#>      [,4]                                                                                                                                                     
#> [1,] ""                                                                                                                                                       
#> [2,] "t1/2\r(h)"                                                                                                                                              
#> [3,] "19.4907\r16.1626\r33.6797\r18.6926\r19.2421\r12.5988\r16.0397\r18.2625\r16.0259\r15.3051\r15.9157\r13.9282\r18.0150\r14.2783\r20.6300\r17.1899\r15.7772"
#> [4,] "16.953\r4.635"                                                                                                                                          
#>      [,5]                                                                                                                                    
#> [1,] ""                                                                                                                                      
#> [2,] "k\r-1\r(h)"                                                                                                                            
#> [3,] "0.0356\r0.0429\r0.0206\r0.0371\r0.0360\r0.0550\r0.0432\r0.0380\r0.0433\r0.0453\r0.0436\r0.0498\r0.0385\r0.0485\r0.0336\r0.0403\r0.0439"
#> [4,] "0.041\r0.008"                                                                                                                          
#>      [,6]                                                                                                                                                                          
#> [1,] ""                                                                                                                                                                            
#> [2,] "AUC\r((h)*(μg/L))"                                                                                                                                                          
#> [3,] "916.4590\r1045.9808\r965.4304\r720.5525\r1583.8412\r862.4255\r1066.7322\r756.7529\r432.1423\r760.9732\r920.5969\r859.4208\r402.3563\r871.6733\r292.4521\r1779.6286\r426.5237"
#> [4,] "862.585\r387.896"                                                                                                                                                            
#>      [,7]                                                                                                                                                                                                       
#> [1,] ""                                                                                                                                                                                                         
#> [2,] "AUMC\r2\r(h *(μg/L))"                                                                                                                                                                                    
#> [3,] "24149.6147\r22147.3277\r42981.7409\r25150.5644\r62344.9600\r14429.7069\r19017.5640\r18046.5471\r10393.0142\r16444.2768\r18683.2467\r18993.2503\r11488.7713\r18977.0425\r14150.9482\r29230.7834\r9245.2438"
#> [4,] "22110.271\r13100.512"                                                                                                                                                                                     
#>      [,8]                                                                                                                                                     
#> [1,] ""                                                                                                                                                       
#> [2,] "MRT\r(h)"                                                                                                                                               
#> [3,] "25.3132\r20.1290\r35.5532\r30.8446\r36.8007\r16.2977\r17.1660\r22.2495\r22.6083\r20.6046\r19.3564\r21.8207\r26.1951\r20.8927\r33.3108\r15.7553\r18.8330"
#> [4,] "23.749\r6.636"                                                                                                                                          
#>      [,9]                                                                                                                                                                     
#> [1,] ""                                                                                                                                                                       
#> [2,] "Cl/F\r-1\r(L h)"                                                                                                                                                        
#> [3,] "188.6725\r163.5964\r148.8906\r220.7514\r106.2496\r203.3020\r162.4747\r221.9216\r391.5606\r225.5397\r186.4857\r206.7958\r410.4117\r198.1703\r423.7134\r97.0194\r366.6686"
#> [4,] "193.813\r102.904"                                                                                                                                                       
#>      [,10]                                                                                                                                                                                        
#> [1,] ""                                                                                                                                                                                           
#> [2,] "Vz/F\r(L)"                                                                                                                                                                                  
#> [3,] "5305.3049\r3814.6961\r7234.5318\r5953.1646\r2949.5466\r3695.2513\r3759.7280\r5847.0182\r9053.0953\r4980.0446\r4281.9959\r4155.3990\r10666.6630\r4082.1691\r12610.9129\r2406.0638\r8345.9882"
#> [4,] "5831.857\r2853.276"
extract_tables(file=source_url, pages=34)
#> [[1]]
#>       [,1]     [,2]       [,3]     [,4]      [,5]     [,6] [,7]           
#>  [1,] ""       ""         ""       ""        ""       ""   ""             
#>  [2,] "Sample" "Cmax"     "Tmax"   "t1/2"    "k"      ""   "AUC"          
#>  [3,] ""       ""         ""       ""        "-1"     ""   ""             
#>  [4,] ""       "(μg/L)"  "(h)"    "(h)"     "(h )"   ""   "((h)*(μg/L))"
#>  [5,] "1"      "34.0000"  "1.0000" "6.2258"  "0.1113" ""   "71.0204"      
#>  [6,] "2"      "115.0000" "1.0000" "28.5859" "0.0242" ""   "316.8011"     
#>  [7,] "3"      "19.0000"  "0.5000" "8.0000"  "0.0866" ""   "83.3601"      
#>  [8,] "4"      "21.0000"  "1.0000" "0.2090"  "3.3165" ""   "11.3760"      
#>  [9,] "5"      "84.0000"  "1.0000" "4.0870"  "0.1696" ""   "177.1745"     
#> [10,] "6"      "77.0000"  "1.0000" "2.7855"  "0.2488" ""   "185.4200"     
#> [11,] "7"      "71.0000"  "0.5000" "5.7580"  "0.1204" ""   "139.9458"     
#> [12,] "8"      "70.0000"  "1.5000" "2.3583"  "0.2939" ""   "185.4150"     
#> [13,] "9"      "29.0000"  "0.5000" "16.8560" "0.0411" ""   "131.1147"     
#> [14,] "10"     "21.0000"  "0.5000" "1.9261"  "0.3599" ""   "25.1871"      
#> [15,] "11"     "31.0000"  "0.5000" "7.1459"  "0.0970" ""   "51.4741"      
#> [16,] "12"     "30.0000"  "1.0000" "7.5987"  "0.0912" ""   "54.0282"      
#> [17,] "13"     "4.0000"   "1.5000" "3.8289"  "0.1810" ""   "9.5000"       
#> [18,] "14"     "12.0000"  "1.0000" "7.4071"  "0.0936" ""   "44.2087"      
#> [19,] "15"     "13.0000"  "0.5000" "5.7177"  "0.1212" ""   "38.8940"      
#> [20,] "16"     "21.0000"  "0.5000" "20.4606" "0.0339" ""   "140.7535"     
#> [21,] "17"     "36.0000"  "1.0000" "4.5369"  "0.1528" ""   "85.0728"      
#> [22,] "18"     "53.0000"  "0.5000" "6.0518"  "0.1145" ""   "79.5132"      
#> [23,] "mean"   "45.313"   "0.781"  "2.060*"  "0.336"  ""   "111.034"      
#> [24,] "SD"     "29.570"   "0.315"  "7.569"   "0.800"  ""   "78.842"       
#>       [,8]           [,9]      [,10]       [,11]      
#>  [1,] "Parameter"    ""        ""          ""         
#>  [2,] "AUMC"         "MRT"     "Cl/F"      "Vz/F"     
#>  [3,] "2"            ""        "-1"        ""         
#>  [4,] "(h *(μg/L))" "(h)"     "(L h )"    "(L)"      
#>  [5,] "1105.3174"    "12.8630" "232.7478"  "2090.5190"
#>  [6,] "19035.9680"   "43.8855" "46.1080"   "1901.5285"
#>  [7,] "1149.3662"    "11.4954" "200.0305"  "2308.6641"
#>  [8,] "12.7360"      "1.1195"  "1757.9327" "1967.9204"
#>  [9,] "1298.8885"    "7.3613"  "113.3473"  "668.3335" 
#> [10,] "1020.2504"    "4.9621"  "97.2732"   "390.9068" 
#> [11,] "1317.8928"    "7.1588"  "108.6396"  "902.4717" 
#> [12,] "828.3005"     "4.5961"  "110.9758"  "377.5689" 
#> [13,] "3225.5149"    "20.7095" "128.4103"  "3122.6831"
#> [14,] "102.1391"     "3.6089"  "706.6599"  "1963.6497"
#> [15,] "705.6419"     "9.2359"  "261.7728"  "2698.7078"
#> [16,] "772.7076"     "9.5495"  "247.1694"  "2709.6305"
#> [17,] "28.5000"      "3.0000"  "2105.2632" "6315.7895"
#> [18,] "636.7942"     "11.8847" "373.2659"  "3988.8049"
#> [19,] "534.3282"     "9.2700"  "346.9771"  "2862.1779"
#> [20,] "5723.8315"    "29.6098" "103.4614"  "3054.0177"
#> [21,] "836.2202"     "7.4433"  "178.0225"  "1165.2329"
#> [22,] "655.4245"     "7.7132"  "235.3636"  "2054.9255"
#> [23,] "2395.283"     "11.911"  "147.665*"  "1889.934" 
#> [24,] "4647.220"     "10.960"  "417.248"   "928.346"
sessionInfo()
#> R version 3.6.2 (2019-12-12)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 18363)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=English_United States.1252 
#> [2] LC_CTYPE=English_United States.1252   
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C                          
#> [5] LC_TIME=English_United States.1252    
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] tabulizer_0.2.2
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_1.0.3          png_0.1-7           digest_0.6.23      
#>  [4] magrittr_1.5        evaluate_0.14       highr_0.8          
#>  [7] rlang_0.4.2         stringi_1.4.3       rmarkdown_2.0      
#> [10] tabulizerjars_1.0.1 tools_3.6.2         stringr_1.4.0      
#> [13] xfun_0.11           yaml_2.2.0          compiler_3.6.2     
#> [16] rJava_0.9-11        htmltools_0.4.0     knitr_1.26

Created on 2020-01-01 by the reprex package (v0.3.0)