Unable to Extract Images from Docx File
JNimkarLS opened this issue · 7 comments
Description
I have a docx file that contains a single image. Here's the code I am using to pull out the images:
// Fragment of a larger function (its signature is not shown): parse the
// in-memory content, walk the images found in it, then extract the text.
// NOTE(review): this calls presentation.Read although the input is described
// as a .docx file — confirm the intended package.
reader := bytes.NewReader(content)
// The reader's own size is passed as the second argument to Read.
doc, err := presentation.Read(reader, reader.Size())
if err != nil {
return "", nil, fmt.Errorf("presentation read failure with error: %v", err)
}
// Guard against a nil result even though no error was reported.
if doc == nil {
return "", nil, fmt.Errorf("internal error: [presentation.Read] returned a nil pointer")
}
defer doc.Close()
for _, img := range doc.Images {
// An image without a path cannot be read from disk; warn and skip it.
if img.Path() == "" {
ctx.Logger().Warn("received an image with an empty path")
continue
}
// Load the image bytes from the path reported by the image.
// NOTE(review): data is not used afterwards in this fragment.
data, err := os.ReadFile(img.Path())
if err != nil {
// A failed read is logged and the loop moves on to the next image.
ctx.Logger().Error("failed to read file: %s with error: %v", img.Path(), err)
continue
}
}
extracted := doc.ExtractText()
Expected Behavior
The file I've attached below contains one image, so doc.Images should have a length of 1; however, its length is 0. As a note, I created this document on Microsoft OneDrive and downloaded it from OneDrive to share with you.
Actual Behavior
The length of doc.Images is 0, so the for loop is never entered.
J1.docx
Please include a reproducible code snippet or document attachment that
demonstrates the issue.
Hi @JNimkarLS,
I tried extracting the image from J1.docx, and it was extracted successfully using the code below.
// Copyright 2022 FoxyUtils ehf. All rights reserved.
package main
import (
"io"
"log"
"os"
"strconv"
"github.com/unidoc/unioffice/common/license"
"github.com/unidoc/unioffice/document"
)
// init registers the metered license API key before the library is used.
// The key is taken from the UNIDOC_LICENSE_API_KEY environment variable; a
// free key can be created at https://cloud.unidoc.io. If the key cannot be
// set, the program panics with the returned error.
func init() {
	apiKey := os.Getenv(`UNIDOC_LICENSE_API_KEY`)
	if err := license.SetMeteredKey(apiKey); err != nil {
		panic(err)
	}
}
// main opens J1.docx from the working directory and, for every image in
// doc.Images, copies the image file to "<index>.<format>" and then reads the
// image bytes back and logs them. Any failure panics.
func main() {
	doc, err := document.Open("J1.docx")
	if err != nil {
		panic(err)
	}
	defer doc.Close()

	for idx, image := range doc.Images {
		target := strconv.Itoa(idx) + "." + image.Format()
		if copyErr := extractImgFile(image.Path(), target); copyErr != nil {
			panic(copyErr)
		}

		// In case you want the raw bytes of the image file.
		raw, readErr := os.ReadFile(image.Path())
		if readErr != nil {
			panic(readErr)
		}
		log.Printf("bytes: %v\n", raw)
	}
}
// extractImgFile copies the file at src into dst, creating dst or truncating
// it if it already exists.
//
// It returns the first error encountered while opening src, creating dst,
// copying the data, or closing dst. The error from closing dst is reported
// (when nothing failed earlier) because a failed close on a freshly written
// file means the copy may be incomplete on disk.
func extractImgFile(src, dst string) (err error) {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	// The source is only read, so its close error carries no information.
	defer in.Close()

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer func() {
		// Keep an earlier copy error; otherwise surface the close error.
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	_, err = io.Copy(out, in)
	return err
}
Is this the correct file? I looked at your code, and it is trying to open a presentation file (PPTX) with presentation.Read().
@sampila Thanks for the quick response. My apologies - I am actually using document.Read(). I accidentally sent the snippet that is scanning presentations. However, the code between the two is the same. Here is the code:
// Fragment of a larger function (its signature and the creation of reader are
// not shown): parse the document from reader and walk the images found in it.
// The reader's own size is passed as the second argument to Read.
doc, err := document.Read(reader, reader.Size()) // <--- document comes in as a byte array
if err != nil {
return "", nil, fmt.Errorf("document read failure with error: %v", err)
}
// Guard against a nil result even though no error was reported.
if doc == nil {
return "", nil, fmt.Errorf("internal error: [document.Read] returned a nil pointer")
}
defer doc.Close()
for _, img := range doc.Images {
ctx.Logger().Debug("image found in docx file - scanning image")
// An image without a path cannot be read from disk; warn and skip it.
if img.Path() == "" {
ctx.Logger().Warn("received an image with an empty path")
continue
}
// Load the image bytes from the path reported by the image.
// NOTE(review): raw is not used afterwards in this fragment.
raw, err := os.ReadFile(img.Path())
if err != nil {
// A failed read is logged and the loop moves on to the next image.
ctx.Logger().Error("failed to read file: %s with error: %v", img.Path(), err)
continue
}
}
The issue I am having is that the service receives the MS word document as a byte array and the docx file is not stored locally. So I must use document.Read() and not document.Open(). Can you reproduce the error with this code?
Hi @JNimkarLS,
I tried to modify the code to use document.Read() and extract the image, here's my code.
// Copyright 2022 FoxyUtils ehf. All rights reserved.
package main
import (
"io"
"log"
"os"
"strconv"
"github.com/unidoc/unioffice/common/license"
"github.com/unidoc/unioffice/document"
)
// init loads the metered license API key, which must happen before the
// library is used. The key comes from the UNIDOC_LICENSE_API_KEY environment
// variable (sign up at https://cloud.unidoc.io to create a free one). A
// failure to set the key panics with the returned error.
func init() {
	key := os.Getenv(`UNIDOC_LICENSE_API_KEY`)
	if setErr := license.SetMeteredKey(key); setErr != nil {
		panic(setErr)
	}
}
// main parses J1.docx through document.Read (a reader plus its size) instead
// of document.Open, then, for every image in doc.Images, copies the image
// file to "<index>.<format>" and reads the image bytes back and logs them.
// Any failure panics.
func main() {
	filename := "J1.docx"
	docFile, err := os.Open(filename)
	if err != nil {
		panic(err)
	}
	defer docFile.Close()

	// Stat the handle that is already open rather than the path again, so the
	// reported size always belongs to the file that is actually being read.
	docFileInfo, err := docFile.Stat()
	if err != nil {
		panic(err)
	}

	doc, err := document.Read(docFile, docFileInfo.Size())
	if err != nil {
		panic(err)
	}
	if doc == nil {
		// err is nil on this path, so panicking with it would hide the cause.
		panic("document.Read returned a nil document without an error")
	}
	defer doc.Close()

	for i, img := range doc.Images {
		destImg := strconv.Itoa(i) + "." + img.Format()
		if err := extractImgFile(img.Path(), destImg); err != nil {
			panic(err)
		}

		// In case you want the raw bytes of the image file.
		imgBytes, err := os.ReadFile(img.Path())
		if err != nil {
			panic(err)
		}
		log.Printf("bytes: %v\n", imgBytes)
	}
}
// extractImgFile copies the contents of the file at src to dst. dst is
// created if missing and truncated if it already exists.
//
// The returned error is the first failure among opening src, creating dst,
// copying, and closing dst. Closing dst is checked because an error there
// means the written data may not have reached the file, so returning nil
// would wrongly report a complete copy.
func extractImgFile(src, dst string) (err error) {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	// Read-only handle: nothing can be lost if this close fails.
	defer in.Close()

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer func() {
		// Do not overwrite a copy error with the close result.
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	_, err = io.Copy(out, in)
	return err
}
I was able to extract the image from J1.docx without any issue.
@sampila Ok interesting. So why doesn't a solution like this work:
// main loads the whole file at documentFilePath into memory, parses it with
// document.Read from a bytes.Reader, and prints how many images were found.
func main() {
	content, err := os.ReadFile(documentFilePath) // <-- path to J1.docx
	if err != nil {
		panic(err)
	}
	reader := bytes.NewReader(content)
	doc, err := document.Read(reader, reader.Size())
	if err != nil {
		panic(fmt.Errorf("document read failure with error: %v", err))
	}
	// Close the document once done, as the other snippets in this thread do;
	// the original version never released it.
	defer doc.Close()
	fmt.Println(len(doc.Images))
}
If I print the length of images for J1.docx it gives me 0.
Hi @JNimkarLS,
I tested your latest code; it works fine on my end, and I was able to extract the image.
// Copyright 2022 FoxyUtils ehf. All rights reserved.
package main
import (
"bytes"
"fmt"
"io"
"os"
"strconv"
"github.com/unidoc/unioffice/common/license"
"github.com/unidoc/unioffice/document"
)
// init sets the metered license API key ahead of any use of the library.
// It reads the key from the UNIDOC_LICENSE_API_KEY environment variable; keys
// are available for free at https://cloud.unidoc.io. The program panics if
// setting the key returns an error.
func init() {
	licenseKey := os.Getenv(`UNIDOC_LICENSE_API_KEY`)
	if licErr := license.SetMeteredKey(licenseKey); licErr != nil {
		panic(licErr)
	}
}
// main loads J1.docx fully into memory, parses it with document.Read from a
// bytes.Reader, prints the number of images found, and then, for every image
// in doc.Images, copies the image file to "<index>.<format>" and checks that
// the image file can be read. Any failure panics.
func main() {
	filename := "J1.docx"
	content, err := os.ReadFile(filename) // <-- path to J1.docx
	if err != nil {
		panic(err)
	}
	reader := bytes.NewReader(content)
	doc, err := document.Read(reader, reader.Size())
	if err != nil {
		panic(fmt.Errorf("document read failure with error: %v", err))
	}
	// Check for nil before touching doc.Images; err is nil on this path, so
	// the panic carries its own message instead of the (nil) error.
	if doc == nil {
		panic("document.Read returned a nil document without an error")
	}
	defer doc.Close()
	fmt.Println(len(doc.Images))

	for i, img := range doc.Images {
		destImg := strconv.Itoa(i) + "." + img.Format()
		if err := extractImgFile(img.Path(), destImg); err != nil {
			panic(err)
		}

		// In case you want the raw bytes of the image file; here the read only
		// verifies that the file is accessible.
		if _, err := os.ReadFile(img.Path()); err != nil {
			panic(err)
		}
	}
}
// extractImgFile writes a copy of the file at src to dst, creating dst when
// it does not exist and truncating it when it does.
//
// It fails with the first error from opening src, creating dst, copying the
// bytes, or closing dst. The close of dst is part of the result because a
// write that only fails at close time would otherwise go unnoticed.
func extractImgFile(src, dst string) (err error) {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	// Closing a read-only file cannot lose data, so its error is not checked.
	defer in.Close()

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer func() {
		// Report the close error only when the copy itself succeeded.
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	_, err = io.Copy(out, in)
	return err
}
Hi @JNimkarLS,
Have you checked the code and the file?
Does the code work on your end?
Hi @JNimkarLS,
We are closing this issue for now; feel free to re-open it if this is still not resolved.
Best regards,
Alip