pdf-rs/pdf

Error UnspecifiedXRefEntry on reading pdf with only two images

SebastianRzk opened this issue · 1 comments

I tried to read a simple PDF (generated by a scanner using utsushi (ImageScanV3)) with the read.rs example code from this repository.

The first page is read fine, but the next item returned by `file.pages()` is an `Err` :(.

With other PDFs, everything works as expected.

Example-PDF:

Unbenannt.pdf

Cargo.toml:

[package]
name = "pdf-image-extractor"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
pdf = "0.8.0"

Console-log:

Finished dev [unoptimized + debuginfo] target(s) in 0.62s
     Running `target/debug/pdf-image-extractor`
read: /pathToProject/data/Unbenannt.pdf
PDF
page ok
thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value: Shared { source: Try { file: "/home/unbekannt/.cargo/registry/src/github.com-1ecc6299db9ec823/pdf-0.8.0/src/file.rs", line: 87, column: 27, context: Context([]), source: UnspecifiedXRefEntry { id: 10 } } }', src/main.rs:39:18
stack backtrace:
   0: rust_begin_unwind
             at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:575:5
   1: core::panicking::panic_fmt
             at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/panicking.rs:64:14
   2: core::result::unwrap_failed
             at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/result.rs:1790:5
   3: core::result::Result<T,E>::unwrap
             at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/result.rs:1112:23
   4: pdf_image_extractor::main
             at ./src/main.rs:39:13
   5: core::ops::function::FnOnce::call_once
             at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/ops/function.rs:250:5
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.

 *  The terminal process "cargo 'run', '--package', 'pdf-image-extractor', '--bin', 'pdf-image-extractor'" terminated with exit code: 101. 
 *  Terminal will be reused by tasks, press any key to close it.
 

Example code:

extern crate pdf;

use std::time::SystemTime;
use std::fs;
use std::collections::HashMap;

use pdf::file::File;
use pdf::object::*;
use pdf::primitive::Primitive;
use pdf::error::PdfError;
use pdf::enc::StreamFilter;


/// Opens a PDF, prints its title/author, then extracts embedded images and
/// fonts into the current directory and lists any form fields.
///
/// Pages that fail to load are reported on stderr and skipped, so one broken
/// page does not prevent the remaining pages from being processed.
///
/// # Errors
///
/// Returns a `PdfError` if the file cannot be opened, if a page's resources
/// cannot be resolved, or if an image's raw data cannot be read.
///
/// # Panics
///
/// Panics if writing an extracted image or font file to disk fails.
fn main() -> Result<(), PdfError> {
    let path = "/pathtoobject/data/Unbenannt.pdf";
    println!("read: {}", path);
    let now = SystemTime::now();

    let file = File::<Vec<u8>>::open(&path)?;
    if let Some(ref info) = file.trailer.info_dict {
        let title = info.get("Title").and_then(|p| p.to_string_lossy().ok());
        let author = info.get("Author").and_then(|p| p.to_string_lossy().ok());

        let descr = match (title, author) {
            (Some(title), None) => title,
            (None, Some(author)) => format!("[no title] – {}", author),
            (Some(title), Some(author)) => format!("{} – {}", title, author),
            _ => "PDF".into()
        };
        println!("{}", descr);
    }

    let mut images: Vec<_> = vec![];
    let mut fonts = HashMap::new();

    for page in file.pages() {
        // Report a broken page and move on instead of panicking on it.
        let page = match page {
            Ok(page) => page,
            Err(e) => {
                eprintln!("skipping unreadable page: {:?}", e);
                continue;
            }
        };
        println!("page ok");

        let resources = page.resources()?;
        for (i, font) in resources.fonts.values().enumerate() {
            // Fonts without a name are keyed by their position on the page.
            let name = match &font.name {
                Some(name) => name.as_str().into(),
                None => i.to_string(),
            };
            fonts.insert(name, font.clone());
        }

        // Collect images once per page, independently of the font loop:
        // a page without fonts (e.g. a scan) must still yield its images,
        // and a page with several fonts must not yield them repeatedly.
        // XObjects that cannot be resolved are skipped.
        images.extend(
            resources.xobjects.iter()
                .filter_map(|(_name, &r)| file.get(r).ok())
                .filter(|o| matches!(**o, XObject::Image(_)))
        );
    }

    for (i, o) in images.iter().enumerate() {
        let img = match **o {
            XObject::Image(ref im) => im,
            _ => continue
        };
        let (data, filter) = img.raw_image_data(&file)?;
        // Only streams whose raw data is already a standalone image format
        // are written out; everything else is skipped.
        let ext = match filter {
            Some(StreamFilter::DCTDecode(_)) => "jpeg",
            Some(StreamFilter::JBIG2Decode) => "jbig2",
            Some(StreamFilter::JPXDecode) => "jp2k",
            _ => continue,
        };

        let fname = format!("extracted_image_{}.{}", i, ext);

        fs::write(fname.as_str(), data).unwrap();
        println!("Wrote file {}", fname);
    }
    println!("Found {} image(s).", images.len());


    for (name, font) in fonts.iter() {
        let fname = format!("font_{}", name);
        if let Some(Ok(data)) = font.embedded_data(&file) {
            fs::write(fname.as_str(), data).unwrap();
            println!("Wrote file {}", fname);
        }
    }
    println!("Found {} font(s).", fonts.len());

    if let Some(ref forms) = file.get_root().forms {
        println!("Forms:");
        for field in forms.fields.iter() {
            print!("  {:?} = ", field.name);
            match field.value {
                Primitive::String(ref s) => {
                    // Fall back to the raw PDF string if it cannot be decoded.
                    match s.to_string_lossy() {
                        Ok(s) => println!("{:?}", s),
                        Err(_) => println!("{:?}", s),
                    }
                }
                Primitive::Integer(i) => println!("{}", i),
                Primitive::Name(ref s) => println!("{}", s),
                ref p => println!("{:?}", p),
            }
        }
    }

    if let Ok(elapsed) = now.elapsed() {
        println!("Time: {}s", elapsed.as_secs_f64());
    }
    Ok(())
}

Full debug log

    Finished dev [unoptimized + debuginfo] target(s) in 0.01s
    Finished dev [unoptimized + debuginfo] target(s) in 0.01s
     Running `/pathToProjectgit/pdf-image-extractor/target/debug/pdf-image-extractor`
     Running `/pathToProjectgit/pdf-image-extractor/target/debug/pdf-image-extractor`
thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value: Shared { source: Try { file: "/pathToProject.cargo/registry/src/github.com-1ecc6299db9ec823/pdf-0.8.0/src/file.rs", line: 87, column: 27, context: Context([]), source: UnspecifiedXRefEntry { id: 10 } } }', src/main.rs:39:18
thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value: Shared { source: Try { file: "/pathToProject.cargo/registry/src/github.com-1ecc6299db9ec823/pdf-0.8.0/src/file.rs", line: 87, column: 27, context: Context([]), source: UnspecifiedXRefEntry { id: 10 } } }', src/main.rs:39:18
stack backtrace:
stack backtrace:
   0:     0x5601e498c08a -    0:     0x5601e498c08a - std::backtrace_rs::backtrace::libunwind::trace::hba70c054c9cdbd74
       std::backtrace_rs::backtrace::libunwind::trace::hba70c054c9cdbd74
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/../../backtrace/src/backtrace/libunwind.rs:                        at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/../../backtrace/src/backtrace/libunwind.rs:93:5
   1:    93:5
   1:     0x5601e498c08a - std::backtrace_rs::backtrace::trace_unsynchronized::hfff24a4d77b00fef
 0x5601e498c08a - std::backtrace_rs::backtrace::trace_unsynchronized::hfff24a4d77b00fef
                                                 at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/../../backtrace/src/backtrace/mod.rs:66:5
   2:               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/../../backtrace/src/backtrace/mod.rs:66:5
   2:     0x5601e498c08a - std::sys_common::backtrace::_print_fmt::h6fb3e9652d3b4f4e
   0x5601e498c08a - std::sys_common::backtrace::_print_fmt::h6fb3e9652d3b4f4e
                                                 at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/sys_common/backtrace.rs:65:5
   3:               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/sys_common/backtrace.rs:65:5
   3:     0x5601e498c08a - <std::sys_common::backtrace::_print   0x5601e498c08a - <std::sys_common::backtrace::_print::DisplayBacktrace as core::fmt::Display>::DisplayBacktrace as core::fmt::Display>::fmt::h254ba81a1e20fed0
           ::fmt::h254ba81a1e20fed0
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/sys_common/backtrace.rs:44:22
                    at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/sys_common/backtrace.rs:44:22
   4:     0x5601e49acc4e - core::   4:     0x5601e49acc4e - core::fmt::write::h232ccd94259bfe24
               fmt::write::h232ccd94259bfe24
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/fmt/mod.rs:1213:17
                at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/fmt/mod.rs:1213:17
   5:     0x5601e4989ef5 - std::io   5:     0x5601e4989ef5 - std::io::Write::write_fmt::h963cfaecfdd596f7
               ::Write::write_fmt::h963cfaecfdd596f7
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/io/mod.rs:1682:15
                at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/io/mod.rs:1682:15
   6:     0x5601e498be55 - std::   6:     0x5601e498be55 - std::sys_common::backtrace::_print::h6fbc4343523214ce
             sys_common::backtrace::_print::h6fbc4343523214ce
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/sys_common/backtrace.rs:47:5
                    at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/sys_common/backtrace.rs:47:5
   7:     0x5601e498be55 - std::sys_common::backtrace 7:     0x5601e498be55 - std::sys_common::backtrace::print::h55ab07cec21aacd5
            ::print::h55ab07cec21aacd5
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/sys_common/backtrace.rs:34:9
                   at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/sys_common/backtrace.rs:34:9
   8:     0x5601e498d63f - std::panicking   8:     0x5601e498d63f - std::panicking::default_hook::{{closure}}::hc10df65206eee69e
     ::default_hook::{{closure}}::hc10df65206eee69e
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:                          at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:267:22
267:22
   9:     0x5601e498d37b - std::   9:     0x5601e498d37b - std::panicking::default_hook::hdd684731d8d78925
                panicking::default_hook::hdd684731d8d78925
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:286:9
               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:286:9
  10:     0x5601e498dd49 - std::panicking  10:     0x5601e498dd49 - std::panicking::rust_panic_with_hook::h58681788b2d08dc0
                ::rust_panic_with_hook::h58681788b2d08dc0
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:688:13
               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:688:13
  11:     0x5601e498dae9 - std::  11:     0x5601e498dae9 - std::panicking::begin_panic_handler::{{closure}}::he6d9da406579493c
  panicking::begin_panic_handler::{{closure}}::he6d9da406579493c
                               at                              at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:579:13
  /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:579:13
  12:     0x5601e498c53c - std::sys_common::backtrace::12:     0x5601e498c53c - std::sys_common::backtrace::__rust_end_short_backtrace::h5b1f3b233c047d47
             __rust_end_short_backtrace::h5b1f3b233c047d47
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/sys_common/backtrace.rs:137:18
                  at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/sys_common/backtrace.rs:137:18
  13:     0x5601e498d7f2 - rust_begin_unwind  13:     0x5601e498d7f2 - rust_begin_unwind
                
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:575:5
               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:575:5
  14:     0x5601e4696783 -   14:     0x5601e4696783 - core::panicking::panic_fmt::hea602a2467b5109d
        core::panicking::panic_fmt::hea602a2467b5109d
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/panicking.rs:64:14
                       at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/panicking.rs:64:14
  15:       15:     0x5601e4696c33 - core::result::unwrap_failed::he3f6a4db4030a3f8
     0x5601e4696c33 - core::result::unwrap_failed::he3f6a4db4030a3f8
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/result.rs:                          at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/result.rs:1790:5
1790:5
  16:     0x5601e47ac882 - core::result::  16:     0x5601e47ac882 - core::result::Result<T,E>::unwrap::hd1617ae5f4a1be4c
          Result<T,E>::unwrap::hd1617ae5f4a1be4c
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/result.rs:1112:23
                     at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/result.rs:1112:23
  17:     0x5601e47c8eb8 - pdf_image_extractor::main::  17:     0x5601e47c8eb8 - pdf_image_extractor::main::h0dbf0546c6f40cd8
                               at /pathToProjectgit/pdf-image-extractor/src/main.rsh0dbf0546c6f40cd8
                               at /pathToProjectgit/pdf-image-extractor/src/main.rs:39:13
:39:13
  18:     0x5601e47041d2 -   18:     0x5601e47041d2 - core::ops::function::FnOnce::call_once::h1414c805c2cd5bad
       core::ops::function::FnOnce::call_once::h1414c805c2cd5bad
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/ops/function.rs:250                        at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/ops/function.rs:250:5
:5
  19:     0x5601e46e1ef5 -   19:     0x5601e46e1ef5 - std::sys_common::backtrace::__rust_begin_short_backtrace::h04581fca3248f830
          std::sys_common::backtrace::__rust_begin_short_backtrace::h04581fca3248f830
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/sys_common/backtrace.rs:121:18
                     at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/sys_common/backtrace.rs:121:18
  20:     0x5601e47405b6 -   20:     0x5601e47405b6 - std::rt::lang_start::{{closure}}::hb47817ff59d8bd3e
   std::rt::lang_start::{{closure}}::hb47817ff59d8bd3e
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/rt.rs                            at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/rt.rs:166:18
:166:18
  21:     0x5601e498651c -   21:     0x5601e498651c - core::ops::function::impls::<impl core::ops::function::FnOncecore::ops::function::impls::<impl core::ops::function::FnOnce<A> for &F>::call_once::h2dd1a24ae3e0569f
<A> for &F>::call_once::h2dd1a24ae3e0569f
                                                at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/ops/function.rs:287:13
  22:                 at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/core/src/ops/function.rs:287:13
  22:     0x5601e498651c - std::panicking::try::do_call::h71e38d3ed05d0919
  0x5601e498651c - std::panicking::try::do_call::h71e38d3ed05d0919
                                                 at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:483:40
  23:                 at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:483:40
  23:     0x5601e498651c - std::panicking::try::h9dd8fea17c119511
      0x5601e498651c - std::panicking::try::h9dd8fea17c119511
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:                          at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:447:19
  24:     0x5601e498651c - 447:19
  24:     0x5601e498651c - std::panic::catch_unwind::h073a10d358958706
        std::panic::catch_unwind::h073a10d358958706
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panic.rs:140:14
                       at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panic.rs:140:14
  25:     0x5601e498651c - std::rt  25:     0x5601e498651c - std::rt::lang_start_internal::{{closure}}::h0cf5d9b5652f6b98
   ::lang_start_internal::{{closure}}::h0cf5d9b5652f6b98
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/rt.rs                            at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/rt.rs:148:48
  26:     0x5601e498651c:148:48
  26:     0x5601e498651c - std::panicking::try::do_call::hc59ab1d339fa21e7
     - std::panicking::try::do_call::hc59ab1d339fa21e7
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:                           at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:483:40
  27:     0x5601e498651c - 483:40
  27:     0x5601e498651c - std::panicking::try::h40dd3124b394a6da
        std::panicking::try::h40dd3124b394a6da
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:447:19                       at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panicking.rs:447:19
  28:     0x5601e498651c - std::panic::catch_unwind::hff10c6c48e0fc17d
     
  28:     0x5601e498651c - std::panic::catch_unwind::hff10c6c48e0fc17d
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panic.rs:                          at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/panic.rs:140:14
  29:     0x5601e498651c - 140:14
  29:     0x5601e498651c - std::rt::lang_start_internal::h7868f0ffe3ad1ec2
        std::rt::lang_start_internal::h7868f0ffe3ad1ec2
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/rt.rs:148:20
                       at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/rt.rs:148:20
  30:     0x  30:     0x5601e474058a - std::rt::lang_start::h80b50788ba6911f9
      5601e474058a - std::rt::lang_start::h80b50788ba6911f9
                               at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/rt.rs:165                         at /rustc/8460ca823e8367a30dda430efda790588b8c84d3/library/std/src/rt.rs:165:17
:17
  31:     0x5601e47ca76e -   31:     0x5601e47ca76e - main
main
  32:     0x7f64dccdd790 -   32:     0x7f64dccdd790 - <unknown>
  33:     0x7f64dccdd84a - <unknown>
  33:     0x7f64dccdd84a - __libc_start_main
__libc_start_main
  34:     0x5601e4696e15 -   34:     0x5601e4696e15 - _start
  35:          _start
  35:                0x0 - <unknown>
      0x0 - <unknown>