wikidump
is a small package for reading bz2
compressed Wikipedia dump XML
files (e.g. enwiki-latest-pages-articles.xml.bz2
from
https://dumps.wikimedia.org/enwiki/latest/):
package main
import (
"fmt"
"github.com/msrocka/wikidump"
)
func main() {
dump := "path/to/dump.bz2"
reader, err := wikidump.NewReader(dump)
if err != nil {
panic(err)
}
defer reader.Close()
// print all page titles in the wikidump
err = reader.Read(func(p *wikidump.Page) bool {
fmt.Println(p.Title)
return true // return true to continue
})
if err != nil {
panic(err)
}
}