Different Extraction Format for Apparently Identical Tables
billdenney opened this issue · 0 comments
billdenney commented
Please specify whether your issue is about:
- a possible bug
- a question about package functionality
- a suggested code or documentation change, improvement to the code, or feature request
I believe that this is an issue with the underlying Java library, but since I encountered it through tabulizer, I'm posting it here and will cross-link this issue from a report there.
I have a file with two tables that look essentially identical, but they extract very differently. Is there a reason that the table on page 33 extracts with each column's values collapsed into a single cell (separated by `\r`), while the visually identical table on page 34 extracts with one value per cell (which is what I would prefer)?
library(tabulizer)
source_url <- "https://ir.library.oregonstate.edu/xmlui/bitstream/handle/1957/39168/KarnprachaChanida2013.pdf"
extract_tables(file=source_url, pages=33)
#> [[1]]
#> [,1]
#> [1,] "Sample"
#> [2,] ""
#> [3,] "1\r2\r3\r4\r5\r6\r7\r8\r9\r10\r11\r12\r13\r14\r15\r16\r17"
#> [4,] "mean\rSD"
#> [,2]
#> [1,] "Parameter"
#> [2,] "Cmax\r(μg/L)"
#> [3,] "196.6470\r164.7870\r305.5020\r31.8423\r89.9160\r188.6820\r205.3200\r111.8640\r29.3820\r172.5750\r137.0000\r97.7040\r22.4967\r71.6850\r18.1248\r351.5220\r70.8000"
#> [4,] "133.285\r96.836"
#> [,3]
#> [1,] ""
#> [2,] "Tmax\r(h)"
#> [3,] "1.0000\r1.0000\r0.5000\r4.0000\r1.5000\r1.0000\r1.0000\r1.0000\r4.0000\r1.0000\r4.0000\r1.0000\r8.0000\r1.5000\r2.0000\r1.0000\r1.0000"
#> [4,] "2.029\r1.932"
#> [,4]
#> [1,] ""
#> [2,] "t1/2\r(h)"
#> [3,] "19.4907\r16.1626\r33.6797\r18.6926\r19.2421\r12.5988\r16.0397\r18.2625\r16.0259\r15.3051\r15.9157\r13.9282\r18.0150\r14.2783\r20.6300\r17.1899\r15.7772"
#> [4,] "16.953\r4.635"
#> [,5]
#> [1,] ""
#> [2,] "k\r-1\r(h)"
#> [3,] "0.0356\r0.0429\r0.0206\r0.0371\r0.0360\r0.0550\r0.0432\r0.0380\r0.0433\r0.0453\r0.0436\r0.0498\r0.0385\r0.0485\r0.0336\r0.0403\r0.0439"
#> [4,] "0.041\r0.008"
#> [,6]
#> [1,] ""
#> [2,] "AUC\r((h)*(μg/L))"
#> [3,] "916.4590\r1045.9808\r965.4304\r720.5525\r1583.8412\r862.4255\r1066.7322\r756.7529\r432.1423\r760.9732\r920.5969\r859.4208\r402.3563\r871.6733\r292.4521\r1779.6286\r426.5237"
#> [4,] "862.585\r387.896"
#> [,7]
#> [1,] ""
#> [2,] "AUMC\r2\r(h *(μg/L))"
#> [3,] "24149.6147\r22147.3277\r42981.7409\r25150.5644\r62344.9600\r14429.7069\r19017.5640\r18046.5471\r10393.0142\r16444.2768\r18683.2467\r18993.2503\r11488.7713\r18977.0425\r14150.9482\r29230.7834\r9245.2438"
#> [4,] "22110.271\r13100.512"
#> [,8]
#> [1,] ""
#> [2,] "MRT\r(h)"
#> [3,] "25.3132\r20.1290\r35.5532\r30.8446\r36.8007\r16.2977\r17.1660\r22.2495\r22.6083\r20.6046\r19.3564\r21.8207\r26.1951\r20.8927\r33.3108\r15.7553\r18.8330"
#> [4,] "23.749\r6.636"
#> [,9]
#> [1,] ""
#> [2,] "Cl/F\r-1\r(L h)"
#> [3,] "188.6725\r163.5964\r148.8906\r220.7514\r106.2496\r203.3020\r162.4747\r221.9216\r391.5606\r225.5397\r186.4857\r206.7958\r410.4117\r198.1703\r423.7134\r97.0194\r366.6686"
#> [4,] "193.813\r102.904"
#> [,10]
#> [1,] ""
#> [2,] "Vz/F\r(L)"
#> [3,] "5305.3049\r3814.6961\r7234.5318\r5953.1646\r2949.5466\r3695.2513\r3759.7280\r5847.0182\r9053.0953\r4980.0446\r4281.9959\r4155.3990\r10666.6630\r4082.1691\r12610.9129\r2406.0638\r8345.9882"
#> [4,] "5831.857\r2853.276"
extract_tables(file=source_url, pages=34)
#> [[1]]
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7]
#> [1,] "" "" "" "" "" "" ""
#> [2,] "Sample" "Cmax" "Tmax" "t1/2" "k" "" "AUC"
#> [3,] "" "" "" "" "-1" "" ""
#> [4,] "" "(μg/L)" "(h)" "(h)" "(h )" "" "((h)*(μg/L))"
#> [5,] "1" "34.0000" "1.0000" "6.2258" "0.1113" "" "71.0204"
#> [6,] "2" "115.0000" "1.0000" "28.5859" "0.0242" "" "316.8011"
#> [7,] "3" "19.0000" "0.5000" "8.0000" "0.0866" "" "83.3601"
#> [8,] "4" "21.0000" "1.0000" "0.2090" "3.3165" "" "11.3760"
#> [9,] "5" "84.0000" "1.0000" "4.0870" "0.1696" "" "177.1745"
#> [10,] "6" "77.0000" "1.0000" "2.7855" "0.2488" "" "185.4200"
#> [11,] "7" "71.0000" "0.5000" "5.7580" "0.1204" "" "139.9458"
#> [12,] "8" "70.0000" "1.5000" "2.3583" "0.2939" "" "185.4150"
#> [13,] "9" "29.0000" "0.5000" "16.8560" "0.0411" "" "131.1147"
#> [14,] "10" "21.0000" "0.5000" "1.9261" "0.3599" "" "25.1871"
#> [15,] "11" "31.0000" "0.5000" "7.1459" "0.0970" "" "51.4741"
#> [16,] "12" "30.0000" "1.0000" "7.5987" "0.0912" "" "54.0282"
#> [17,] "13" "4.0000" "1.5000" "3.8289" "0.1810" "" "9.5000"
#> [18,] "14" "12.0000" "1.0000" "7.4071" "0.0936" "" "44.2087"
#> [19,] "15" "13.0000" "0.5000" "5.7177" "0.1212" "" "38.8940"
#> [20,] "16" "21.0000" "0.5000" "20.4606" "0.0339" "" "140.7535"
#> [21,] "17" "36.0000" "1.0000" "4.5369" "0.1528" "" "85.0728"
#> [22,] "18" "53.0000" "0.5000" "6.0518" "0.1145" "" "79.5132"
#> [23,] "mean" "45.313" "0.781" "2.060*" "0.336" "" "111.034"
#> [24,] "SD" "29.570" "0.315" "7.569" "0.800" "" "78.842"
#> [,8] [,9] [,10] [,11]
#> [1,] "Parameter" "" "" ""
#> [2,] "AUMC" "MRT" "Cl/F" "Vz/F"
#> [3,] "2" "" "-1" ""
#> [4,] "(h *(μg/L))" "(h)" "(L h )" "(L)"
#> [5,] "1105.3174" "12.8630" "232.7478" "2090.5190"
#> [6,] "19035.9680" "43.8855" "46.1080" "1901.5285"
#> [7,] "1149.3662" "11.4954" "200.0305" "2308.6641"
#> [8,] "12.7360" "1.1195" "1757.9327" "1967.9204"
#> [9,] "1298.8885" "7.3613" "113.3473" "668.3335"
#> [10,] "1020.2504" "4.9621" "97.2732" "390.9068"
#> [11,] "1317.8928" "7.1588" "108.6396" "902.4717"
#> [12,] "828.3005" "4.5961" "110.9758" "377.5689"
#> [13,] "3225.5149" "20.7095" "128.4103" "3122.6831"
#> [14,] "102.1391" "3.6089" "706.6599" "1963.6497"
#> [15,] "705.6419" "9.2359" "261.7728" "2698.7078"
#> [16,] "772.7076" "9.5495" "247.1694" "2709.6305"
#> [17,] "28.5000" "3.0000" "2105.2632" "6315.7895"
#> [18,] "636.7942" "11.8847" "373.2659" "3988.8049"
#> [19,] "534.3282" "9.2700" "346.9771" "2862.1779"
#> [20,] "5723.8315" "29.6098" "103.4614" "3054.0177"
#> [21,] "836.2202" "7.4433" "178.0225" "1165.2329"
#> [22,] "655.4245" "7.7132" "235.3636" "2054.9255"
#> [23,] "2395.283" "11.911" "147.665*" "1889.934"
#> [24,] "4647.220" "10.960" "417.248" "928.346"
sessionInfo()
#> R version 3.6.2 (2019-12-12)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 18363)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=English_United States.1252
#> [2] LC_CTYPE=English_United States.1252
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United States.1252
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] tabulizer_0.2.2
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.3 png_0.1-7 digest_0.6.23
#> [4] magrittr_1.5 evaluate_0.14 highr_0.8
#> [7] rlang_0.4.2 stringi_1.4.3 rmarkdown_2.0
#> [10] tabulizerjars_1.0.1 tools_3.6.2 stringr_1.4.0
#> [13] xfun_0.11 yaml_2.2.0 compiler_3.6.2
#> [16] rJava_0.9-11 htmltools_0.4.0 knitr_1.26
Created on 2020-01-01 by the reprex package (v0.3.0)