/webarchive

golang readers for ARC and WARC webarchive formats

Primary language: Go. License: Apache License 2.0 (Apache-2.0).

A reader for the WARC and ARC web archive formats.

Note: This package was written for use in https://github.com/richardlehane/siegfried and has a number of quirks relating to that use case. If you're after a general-purpose Go WARC package, you may be better served by one of the other Go WARC libraries.

Example usage:

f, _ := os.Open("examples/IAH-20080430204825-00000-blackbook.arc")
// NewReader(io.Reader) can be used to read WARC, ARC or gzipped WARC or ARC files
rdr, err := webarchive.NewReader(f)
if err != nil {
  log.Fatal(err)
}
// use Next() to iterate through all records in the WARC or ARC file
for record, err := rdr.Next(); err == nil; record, err = rdr.Next() {
  // records implement the io.Reader interface
  i, err := io.Copy(ioutil.Discard, record)
  if err != nil {
    log.Fatal(err)
  }
  fmt.Printf("Read: %d bytes\n", i)
  // records also have URL(), MIME(), Date() and Size() methods
  fmt.Printf("URL: %s, MIME: %s, Date: %v, Size: %d\n", 
    record.URL(), record.MIME(), record.Date(), record.Size())
  // the Fields() method returns all the fields in the WARC or ARC record
  for key, values := range record.Fields() {
    fmt.Printf("Field key: %s, Field values: %v\n", key, values)
  }
}
f.Close()
f, _ = os.Open("examples/IAH-20080430204825-00000-blackbook.warc.gz")
defer f.Close()
// readers can Reset() to reuse the underlying buffers
err = rdr.Reset(f)
// the Close() method should be used if you pass in gzipped files; it is a no-op
// for non-gzipped files
defer rdr.Close()
// NextPayload() skips any record that is not a resource, conversion or response
// record, and merges continuations into single records. It also strips HTTP
// headers from response records. After stripping, those HTTP headers are
// available alongside the WARC headers in the record.Fields() map.
for record, err := rdr.NextPayload(); err == nil; record, err = rdr.NextPayload() {
  // webarchive.DecodePayload(record) decodes any encodings (transfer or 
  // content) declared in a record's HTTP header.
  // webarchive.DecodePayloadT(record) just decodes transfer encodings.
  // Both decode chunked, deflate and gzip encodings.
  record = webarchive.DecodePayload(record)
  i, err := io.Copy(ioutil.Discard, record)
  if err != nil {
    log.Fatal(err)
  }
  fmt.Printf("Read: %d bytes\n", i)
  // any skipped HTTP headers can be retrieved from the Fields() map
  for key, values := range record.Fields() {
    fmt.Printf("Field key: %s, Field values: %v\n", key, values)
  }
}

Install with `go get github.com/richardlehane/webarchive`.

GoDoc