Unable to Extract Images from Docx File

Question

Unable to Extract Images from Docx File

JNimkarLS opened this issue 2 years ago · 7 comments

Description

I have a docx file that contains a single image. Here's the code I am using to pull out the images:

	reader := bytes.NewReader(content)
	doc, err := presentation.Read(reader, reader.Size())
	if err != nil {
		return "", nil, fmt.Errorf("presentation read failure with error: %v", err)
	}
	if doc == nil {
		return "", nil, fmt.Errorf("internal error: [presentation.Read] returned a nil pointer")
	}
	defer doc.Close()

	for _, img := range doc.Images {
		if img.Path() == "" {
			ctx.Logger().Warn("received an image with an empty path")
			continue
		}
		data, err := os.ReadFile(img.Path())
		if err != nil {
			ctx.Logger().Error("failed to read file: %s with error: %v", img.Path(), err)
			continue
		}
	}
	extracted := doc.ExtractText()

Expected Behavior

The file I've attached below contains one image, so the length of doc.Images should be 1; however, it is 0. Note that I created this document on Microsoft OneDrive and downloaded it from OneDrive to share with you.

Actual Behavior

The length of doc.Images is 0, so the for loop is never entered.
J1.docx

Please include a reproducible code snippet or document attachment that
demonstrates the issue.

Answer 1 · 2022-08-09T07:44:55.000Z

Hi @JNimkarLS,

I tried extracting the image from J1.docx, and it was extracted successfully using the code below.

// Copyright 2022 FoxyUtils ehf. All rights reserved.
package main

import (
	"io"
	"log"
	"os"
	"strconv"

	"github.com/unidoc/unioffice/common/license"
	"github.com/unidoc/unioffice/document"
)

// init registers the metered license key before any library call is made.
// The key is taken from the UNIDOC_LICENSE_API_KEY environment variable; a
// free key can be created at https://cloud.unidoc.io. A failure to set the
// key aborts the program.
func init() {
	if err := license.SetMeteredKey(os.Getenv(`UNIDOC_LICENSE_API_KEY`)); err != nil {
		panic(err)
	}
}

// main opens J1.docx by path and, for each image the document exposes,
// copies it to a file named "<index>.<format>" and logs its raw bytes.
// Any error aborts the program.
func main() {
	doc, openErr := document.Open("J1.docx")
	if openErr != nil {
		panic(openErr)
	}
	defer doc.Close()

	for idx, image := range doc.Images {
		// The output name is the loop index plus the image's format.
		outName := strconv.Itoa(idx) + "." + image.Format()
		copyErr := extractImgFile(image.Path(), outName)
		if copyErr != nil {
			panic(copyErr)
		}

		// In case you want the image contents as a byte slice.
		raw, readErr := os.ReadFile(image.Path())
		if readErr != nil {
			panic(readErr)
		}

		log.Printf("bytes: %v\n", raw)
	}
}

// extractImgFile copies the file at src into dst, creating dst or truncating
// it if it already exists. It returns the first error hit while opening src,
// creating dst, copying the data, or closing dst.
func extractImgFile(src, dst string) (err error) {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	// Closing a file that was written to can report a write failure, so the
	// Close error is returned unless an earlier error is already set.
	defer func() {
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	_, err = io.Copy(out, in)
	return err
}

Is the file correct? I looked at your code, and it is trying to open a presentation file (PPTX) with presentation.Read().

Answer 2 · 2022-08-09T15:08:08.000Z

@sampila Thanks for the quick response. My apologies - I am actually using document.Read(). I accidentally sent the snippet that is scanning presentations. However, the code between the two is the same. Here is the code:

	doc, err := document.Read(reader, reader.Size()) // <--- document comes in as a byte array
	if err != nil {
		return "", nil, fmt.Errorf("document read failure with error: %v", err)
	}
	if doc == nil {
		return "", nil, fmt.Errorf("internal error: [document.Read] returned a nil pointer")
	}
	defer doc.Close()


	for _, img := range doc.Images {
		ctx.Logger().Debug("image found in docx file - scanning image")
		if img.Path() == "" {
			ctx.Logger().Warn("received an image with an empty path")
			continue
		}
		raw, err := os.ReadFile(img.Path())
		if err != nil {
			ctx.Logger().Error("failed to read file: %s with error: %v", img.Path(), err)
			continue
		}
	}

The issue I am having is that the service receives the MS word document as a byte array and the docx file is not stored locally. So I must use document.Read() and not document.Open(). Can you reproduce the error with this code?

Answer 3 · 2022-08-10T14:20:36.000Z

Hi @JNimkarLS,

I modified the code to use document.Read() and extract the image. Here's my code.

// Copyright 2022 FoxyUtils ehf. All rights reserved.
package main

import (
	"io"
	"log"
	"os"
	"strconv"

	"github.com/unidoc/unioffice/common/license"
	"github.com/unidoc/unioffice/document"
)

// init registers the metered license key before any library call is made.
// The key is taken from the UNIDOC_LICENSE_API_KEY environment variable; a
// free key can be created at https://cloud.unidoc.io. A failure to set the
// key aborts the program.
func init() {
	if err := license.SetMeteredKey(os.Getenv(`UNIDOC_LICENSE_API_KEY`)); err != nil {
		panic(err)
	}
}

// main opens J1.docx from disk, parses it with document.Read, and for each
// image the document exposes copies it to a file named "<index>.<format>"
// and logs its raw bytes. Any error aborts the program.
func main() {
	filename := "J1.docx"

	docFile, err := os.Open(filename)
	if err != nil {
		panic(err)
	}
	defer docFile.Close()

	// Stat the open handle rather than the path, so the size passed to
	// document.Read describes the same file that is being read.
	docFileInfo, err := docFile.Stat()
	if err != nil {
		panic(err)
	}

	// document.Read is given the reader together with its total size.
	doc, err := document.Read(docFile, docFileInfo.Size())
	if err != nil {
		panic(err)
	}
	if doc == nil {
		// err is nil on this path, so panic with an explicit message instead
		// of a nil error value.
		panic("document.Read returned a nil document")
	}
	defer doc.Close()

	for i, img := range doc.Images {
		destImg := strconv.Itoa(i) + "." + img.Format()
		if err := extractImgFile(img.Path(), destImg); err != nil {
			panic(err)
		}

		// In case you want the image contents as a byte slice.
		imgBytes, err := os.ReadFile(img.Path())
		if err != nil {
			panic(err)
		}

		log.Printf("bytes: %v\n", imgBytes)
	}
}

// extractImgFile copies the file at src into dst, creating dst or truncating
// it if it already exists. It returns the first error hit while opening src,
// creating dst, copying the data, or closing dst.
func extractImgFile(src, dst string) (err error) {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	// Closing a file that was written to can report a write failure, so the
	// Close error is returned unless an earlier error is already set.
	defer func() {
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	_, err = io.Copy(out, in)
	return err
}

I was able to extract the image from J1.docx without issue.

Answer 4 · 2022-08-10T16:55:44.000Z

@sampila Ok interesting. So why doesn't a solution like this work:

// main reads the whole .docx file into memory, parses it with document.Read
// through a bytes.Reader, and prints how many images the document exposes.
func main() {
	content, err := os.ReadFile(documentFilePath) // <-- path to J1.docx
	if err != nil {
		panic(err)
	}
	reader := bytes.NewReader(content)
	doc, err := document.Read(reader, reader.Size())
	if err != nil {
		panic(fmt.Errorf("document read failure with error: %v", err))
	}
	// Close the document when done, as the other snippets in this thread do.
	defer doc.Close()

	fmt.Println(len(doc.Images))
}

If I print the length of images for J1.docx it gives me 0.

Answer 5 · 2022-08-10T17:36:05.000Z

Hi @JNimkarLS,

I tested your latest code; it works fine on my end, and I was able to extract the image.

// Copyright 2022 FoxyUtils ehf. All rights reserved.
package main

import (
	"bytes"
	"fmt"
	"io"
	"os"
	"strconv"

	"github.com/unidoc/unioffice/common/license"
	"github.com/unidoc/unioffice/document"
)

// init registers the metered license key before any library call is made.
// The key is taken from the UNIDOC_LICENSE_API_KEY environment variable; a
// free key can be created at https://cloud.unidoc.io. A failure to set the
// key aborts the program.
func init() {
	if err := license.SetMeteredKey(os.Getenv(`UNIDOC_LICENSE_API_KEY`)); err != nil {
		panic(err)
	}
}

// main reads J1.docx into memory, parses it with document.Read through a
// bytes.Reader, prints the number of images found, and copies each image to
// a file named "<index>.<format>". Any error aborts the program.
func main() {
	filename := "J1.docx"

	content, err := os.ReadFile(filename) // <-- path to J1.docx
	if err != nil {
		panic(err)
	}
	reader := bytes.NewReader(content)

	doc, err := document.Read(reader, reader.Size())
	if err != nil {
		panic(fmt.Errorf("document read failure with error: %v", err))
	}
	// Check for nil before the first dereference of doc. err is nil on this
	// path, so panic with an explicit message instead of a nil error value.
	if doc == nil {
		panic("document.Read returned a nil document")
	}
	defer doc.Close()

	fmt.Println(len(doc.Images))

	for i, img := range doc.Images {
		destImg := strconv.Itoa(i) + "." + img.Format()
		if err := extractImgFile(img.Path(), destImg); err != nil {
			panic(err)
		}

		// In case you want the image contents as a byte slice.
		if _, err := os.ReadFile(img.Path()); err != nil {
			panic(err)
		}

		//log.Printf("bytes: %v\n", imgBytes)
	}
}

// extractImgFile copies the file at src into dst, creating dst or truncating
// it if it already exists. It returns the first error hit while opening src,
// creating dst, copying the data, or closing dst.
func extractImgFile(src, dst string) (err error) {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	// Closing a file that was written to can report a write failure, so the
	// Close error is returned unless an earlier error is already set.
	defer func() {
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	_, err = io.Copy(out, in)
	return err
}

Answer 6 · 2022-08-14T21:07:15.000Z

Hi @JNimkarLS,

Have you checked the code and the file?
Does the code work on your end?

Answer 7 · 2022-09-17T11:33:53.000Z

Hi @JNimkarLS,

We are closing this issue for now; feel free to re-open it if this is still not resolved.

Best regards,
Alip