mkenney/go-chrome

How to get the raw HTML of a page

tamoyal opened this issue · 6 comments

What version of go-chrome are you using (tag, hash, etc.)?

32dfd32

Issue

I understand how to get the DOM as an object but using a similar methodology to get the raw html is not working. This is the code:

// Reproduction snippet: launches headless Chrome, opens a tab on a test page,
// requests the outer HTML of node 1 when the page load event fires, and prints
// the result as indented JSON.
var err error

	// chrome_path := "/usr/bin/google-chrome"
	chrome_path := "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"

	// Define a chrome instance with remote debugging enabled.
	browser := chrome.New(
		// See https://developers.google.com/web/updates/2017/04/headless-chrome#cli
		// for details about startup flags
		&chrome.Flags{
			"addr":               "localhost",
			"disable-extensions": nil,
			"disable-gpu":        nil,
			"headless":           true,
			"hide-scrollbars":    nil,
			"no-first-run":       nil,
			"no-sandbox":         nil,
			"port":               9222,
			"remote-debugging-address": "0.0.0.0",
			"remote-debugging-port":    9222,
		},
		chrome_path, // Path to Chromium binary
		"/tmp",      // Set the Chromium working directory
		"/dev/null", // Ignore internal Chromium output, set to empty string for os.Stdout
		"/dev/null", // Ignore internal Chromium errors, set to empty string for os.Stderr
	)

	// Start the chrome process.
	if err := browser.Launch(); nil != err {
		panic(err)
	}

	// Open a tab and navigate to the URL whose HTML you want to fetch.
	tab, err := browser.NewTab("http://www.brainjar.com/java/host/test.html")
	if nil != err {
		panic(err)
	}

	// Enable Page events for this tab.
	if enableResult := <-tab.Page().Enable(); nil != enableResult.Err {
		panic(enableResult.Err)
	}

	// Enable the DOM agent for this tab.
	if enableResult := <-tab.DOM().Enable(); nil != enableResult.Err {
		panic(enableResult.Err)
	}

	// Create a channel to receive the outer HTML result.
	outer_html_chan := make(chan *dom.GetOuterHTMLResult)

	// When the page load event fires, request the outer HTML of node 1 and
	// deliver the result on the channel.
	tab.Page().OnLoadEventFired(func(event *page.LoadEventFiredEvent) {
		// NOTE(review): node ID 1 is presumably meant to be the document
		// root; nothing in this snippet asks Chrome for the document before
		// this call.
		params := &dom.GetOuterHTMLParams{
			NodeID:        dom.NodeID(1),
			// BackendNodeID: dom.BackendNodeID(1),
			// ObjectID:      runtime.RemoteObjectID("remote-object-id"),
		}
		outer_html_chan <- <-tab.DOM().GetOuterHTML(params)
	})

	// Block until the handler delivers a result, then print it as JSON.
	result := <-outer_html_chan
	tmp, _ := json.MarshalIndent(result, "", "    ")
	fmt.Printf("%s\n\n", string(tmp))

Which prints:

{
    "outerHTML": ""
}

The Google docs aren't helping me understand what's going on behind the scenes that much so any help would be appreciated. Thanks!

Hi @tamoyal,

I agree, the docs aren't the greatest. The issue is that devtools hasn't parsed the DOM document yet. If you force chrome to evaluate it then the HTML will be available:

tab.DOM().GetDocument(&dom.GetDocumentParams{})

Then your OuterHTML calls will work, here's a working example:

	// When the page load event fires, ask Chrome for the document and then
	// deliver the outer HTML of the root node (node ID 1).
	tab.Page().OnLoadEventFired(func(event *page.LoadEventFiredEvent) {
		// GetDocument hands back a channel; receive from it so the request
		// has completed before GetOuterHTML is issued. Without the receive
		// nothing waits for the document, and the lookup of node 1 can
		// presumably fail with "Could not find node with given id".
		<-tab.DOM().GetDocument(&dom.GetDocumentParams{})
		outer_html_chan <- <-tab.DOM().GetOuterHTML(&dom.GetOuterHTMLParams{
			NodeID: dom.NodeID(1),
		})
	})

Which returns

{
    "outerHTML": "\u003c!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"\u003e\u003chtml\u003e\u003chead\u003e\n\u003ctitle\u003eTest HTML File\u003c/title\u003e\n\u003cmeta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"\u003e\n\u003c/head\u003e\n\u003cbody\u003e\n\n\u003cp\u003eThis is a very simple HTML file.\u003c/p\u003e\n\n\n\n\u003c/body\u003e\u003c/html\u003e"
}

GetDocument (and GetFlattenedDocument) also returns a data structure you can use to iterate the nodes and select a specific one, or you can use the QuerySelector* methods to search for a specific node. For example

	// On page load, fetch the full DOM tree, look for the BODY element among
	// the children of the root's second child, and deliver its outer HTML.
	tab.Page().OnLoadEventFired(func(event *page.LoadEventFiredEvent) {
		doc := <-tab.DOM().GetDocument(&dom.GetDocumentParams{Depth: -1})

		// The root needs at least two children before index 1 can be used.
		if doc.Root.ChildNodeCount < 2 {
			return
		}

		candidates := doc.Root.Children[1].Children
		for i := range candidates {
			if candidates[i].NodeName != "BODY" {
				continue
			}
			bodyParams := &dom.GetOuterHTMLParams{
				NodeID: candidates[i].NodeID,
			}
			outer_html_chan <- <-tab.DOM().GetOuterHTML(bodyParams)
		}
	})

Which returns

{
    "outerHTML": "\u003cbody\u003e\n\n\u003cp\u003eThis is a very simple HTML file.\u003c/p\u003e\n\n\n\n\u003c/body\u003e"
}

The full GetDocument result for that page (as JSON) looks like

{
    "nodeId": 1,
    "backendNodeId": 2,
    "nodeType": 9,
    "nodeName": "#document",
    "localName": "",
    "nodeValue": "",
    "childNodeCount": 2,
    "children": [
        {
            "nodeId": 2,
            "parentId": 1,
            "backendNodeId": 3,
            "nodeType": 10,
            "nodeName": "html",
            "localName": "",
            "nodeValue": "",
            "publicId": "-//W3C//DTD XHTML 1.0 Strict//EN",
            "systemId": "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
        },
        {
            "nodeId": 3,
            "parentId": 1,
            "backendNodeId": 4,
            "nodeType": 1,
            "nodeName": "HTML",
            "localName": "html",
            "nodeValue": "",
            "childNodeCount": 2,
            "children": [
                {
                    "nodeId": 4,
                    "parentId": 3,
                    "backendNodeId": 5,
                    "nodeType": 1,
                    "nodeName": "HEAD",
                    "localName": "head",
                    "nodeValue": "",
                    "childNodeCount": 2,
                    "children": [
                        {
                            "nodeId": 5,
                            "parentId": 4,
                            "backendNodeId": 6,
                            "nodeType": 1,
                            "nodeName": "TITLE",
                            "localName": "title",
                            "nodeValue": "",
                            "childNodeCount": 1,
                            "children": [
                                {
                                    "nodeId": 6,
                                    "parentId": 5,
                                    "backendNodeId": 7,
                                    "nodeType": 3,
                                    "nodeName": "#text",
                                    "localName": "",
                                    "nodeValue": "Test HTML File"
                                }
                            ]
                        },
                        {
                            "nodeId": 7,
                            "parentId": 4,
                            "backendNodeId": 8,
                            "nodeType": 1,
                            "nodeName": "META",
                            "localName": "meta",
                            "nodeValue": "",
                            "attributes": [
                                "http-equiv",
                                "Content-Type",
                                "content",
                                "text/html;charset=utf-8"
                            ]
                        }
                    ]
                },
                {
                    "nodeId": 8,
                    "parentId": 3,
                    "backendNodeId": 9,
                    "nodeType": 1,
                    "nodeName": "BODY",
                    "localName": "body",
                    "nodeValue": "",
                    "childNodeCount": 1,
                    "children": [
                        {
                            "nodeId": 9,
                            "parentId": 8,
                            "backendNodeId": 10,
                            "nodeType": 1,
                            "nodeName": "P",
                            "localName": "p",
                            "nodeValue": "",
                            "childNodeCount": 1,
                            "children": [
                                {
                                    "nodeId": 10,
                                    "parentId": 9,
                                    "backendNodeId": 11,
                                    "nodeType": 3,
                                    "nodeName": "#text",
                                    "localName": "",
                                    "nodeValue": "This is a very simple HTML file."
                                }
                            ]
                        }
                    ]
                }
            ],
            "frameId": "75F2C00830FBD2C19FB56B3941C97E4D"
        }
    ],
    "documentURL": "http://www.brainjar.com/java/host/test.html",
    "baseURL": "http://www.brainjar.com/java/host/test.html"
}

Let me know if that helps.

Thank you @mkenney. I just want the full raw html. The example you gave gives me &{ code=-32000, data=, msg=Could not find node with given id} so I'm wondering ....must I know something about the DOM before I just ask for the full raw HTML?

No, you don't need to know anything about the DOM necessarily, node ID 1 will work, but chrome has to parse the DOM first. There may also be a race condition. Because the page is so small or if chrome is loading from the cache, it's possible that the pageLoadEventFired event is firing in the browser before the websocket connection can be established. I didn't see that issue when loading https://www.google.com/ for example.

Here's a version of the script that is slightly optimized for looking at the DOM. It uses the domContentEventFired event instead, which should fire as soon as the DOM is ready to be parsed (that is before the page load event, so it probably makes the race condition worse, since it fires even earlier than the loadEventFired event), and it registers that event listener earlier. I also added a 2 second timeout that will parse the DOM and get the source if the domContentEventFired event is missed. You'll see that on that page sometimes the DOM content event never fires and the timeout captures the source.

I've been thinking about adding an optional set of callbacks when opening a tab or navigating for registering event listeners or whatever immediately after the tab becomes available, but due to the network bottleneck and the way chrome registers websockets there will always be a gap there. I'm likely missing something and it might make more sense to establish the socket connection before navigating to a new page, but I'll need to do more investigation to see if that works.

Anyway, here's the full script that's working for me:

package main

import (
	"encoding/json"
	"fmt"
	"os"
	"time"

	chrome "github.com/mkenney/go-chrome/tot"
	"github.com/mkenney/go-chrome/tot/dom"
	"github.com/mkenney/go-chrome/tot/page"
)

func main() {
	var err error
	outer_html_chan := make(chan *dom.GetOuterHTMLResult)
	var document *dom.GetDocumentResult

	//chrome_path := "/usr/bin/google-chrome"
	chrome_path := "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"

	// Define a chrome instance with remote debugging enabled.
	browser := chrome.New(
		// See https://developers.google.com/web/updates/2017/04/headless-chrome#cli
		// for details about startup flags
		&chrome.Flags{
			"addr":               "localhost",
			"disable-extensions": nil,
			"disable-gpu":        nil,
			"headless":           true,
			"hide-scrollbars":    nil,
			"no-first-run":       nil,
			"no-sandbox":         nil,
			"port":               9222,
			"remote-debugging-address": "0.0.0.0",
			"remote-debugging-port":    9222,
		},
		chrome_path, // Path to Chromeium binary
		"/tmp",      // Set the Chromium working directory
		"/dev/null", // Ignore internal Chromium output, set to empty string for os.Stdout
		"/dev/null", // Ignore internal Chromium errors, set to empty string for os.Stderr
	)

	// Start the chrome process.
	if err := browser.Launch(); nil != err {
		panic(err)
	}

	// Open a tab and navigate to the URL you want to screenshot.
	tab, err := browser.NewTab("http://www.brainjar.com/java/host/test.html")
	if nil != err {
		panic(err)
	}

	// When the page load event fires, deliver the root DOM node.
	tab.Page().OnDOMContentEventFired(func(event *page.DOMContentEventFiredEvent) {
		document := <-tab.DOM().GetDocument(&dom.GetDocumentParams{Depth: -1})
		outer_html_chan <- <-tab.DOM().GetOuterHTML(&dom.GetOuterHTMLParams{
			NodeID: document.Root.NodeID,
		})
	})

	// Enable the DOM agent for this tab.
	if enableResult := <-tab.DOM().Enable(); nil != enableResult.Err {
		panic(enableResult.Err)
	}

	var result *dom.GetOuterHTMLResult
	select {
	case result = <-outer_html_chan:
	case <-time.After(2 * time.Second):
		fmt.Println("timeout elapsed, requesting dom")
		document = <-tab.DOM().GetDocument(&dom.GetDocumentParams{Depth: -1})
		result = <-tab.DOM().GetOuterHTML(&dom.GetOuterHTMLParams{
			NodeID: document.Root.NodeID,
		})
	}
	tmp, _ := json.MarshalIndent(result, "", "    ")
	fmt.Printf("%s\n\n", string(tmp))
}

Makes sense regarding the socket connection. If it's helpful to your investigation, you may want to check out the internals of https://github.com/mafredri/cdp as it does not require a timeout like the one you have set up above. Thanks for following up.

Thanks, I'll take a look at what that project is doing.