tmdb-movie-webcrawler: A Go repository from RyanTokManMokMTM

TMDB Crawler With Concurrency

This application fetches all movie and crew information from the TMDB API.

Clone the project to go/src/

git clone https://github.com/RyanTokManMokMTM/tmdb-movie-webcrawler.git

Usage:

NAME:
   TMDB Web Crawler - Fetch Movies and person etc...                     
                                                                         
USAGE:                                                                   
   main.exe [global options] command [command options] [arguments...]    
                                                                         
COMMANDS:                                                                
   help, h  Shows a list of commands or help for one command             
                                                                         
GLOBAL OPTIONS:                                                          
   --dbHost value                  Postgres DB Host IP(Default:127.0.0.1)
   --dbUser value, -u value        Postgres DB Username(Default:postgres)
   --dbPw value                    Postgres DB password(Default:null)    
   --db value                      Postgres DB database(Default:null)    
   --dbPort value, -p value        Postgres DB port(Default:5432)
   --moviePath value, --mf value   Data to store in(Default:null)
   --personPath value, --pf value  Data to store in(Default:null)
   --createTable value, -c value   Auto Creating the db Table(0:False,1:True)(Default:false)
   --help, -h                      show help

Example

go build main.go

./main --dbPw admin \
       --db TMDB \
       --moviePath D:/datas/movies  \
       --personPath D:/datas/persons \
       --createTable 1

Package

GzFileDownloader (Download jsonGz files from TMDB)

(Movies): http://files.tmdb.org/p/exports/movie_ids_MM_DD_YYYY.json.gz (People): http://files.tmdb.org/p/exports/person_ids_MM_DD_YYYY.json.gz

JSON Structure

// TMDBJson is the JSON structure decoded from the downloaded TMDB gz files;
// only the "id" key is mapped.
type TMDBJson struct {
	Id int `json:"id"` // mapped to the "id" JSON key
}

Functions:

@Params: url : the URL of the gz file to download
func DownloadGZFile(url string) (*[]*TMDBJson,error)

webCrawler (Fetch movie info and related person info from TMDB)

Movie and Person Data

// MovieInfo describes one movie. The `json` tags map fields to JSON keys and
// the `gorm` tags control how the struct is persisted.
type MovieInfo struct {
    Adult            bool    `json:"adult"`
    BackdropPath     string  `json:"backdrop_path"`
    GenreIds         []int   `json:"-" gorm:"-"` // skipped by both JSON and gorm; genres are stored through a join table instead
    Id               uint    `json:"id" gorm:"primarykey"`
    OriginalLanguage string  `json:"original_language"`
    OriginalTitle    string  `json:"original_title"`
    Overview         string  `json:"overview"`
    Popularity       float64 `json:"popularity"`
    PosterPath       string  `json:"poster_path"`
    ReleaseDate      string  `json:"release_date"`
    Title            string  `json:"title"`
    RunTime          int     `json:"runtime"`
    Video            bool    `json:"video"`
    VoteAverage      float64 `json:"vote_average"`
    VoteCount        int     `json:"vote_count"`
    
    // Decoded from the "videos" JSON key; ignored by gorm.
    VideoInfos VideoResults `json:"videos" gorm:"-"`
    
    // gorm bookkeeping fields, currently disabled:
    //CreatedAt time.Time      `json:"-"`
    //UpdatedAt time.Time      `json:"-"`
    //DeletedAt gorm.DeletedAt `gorm:"index" json:"-"`
    
    // Many-to-many relationship:
    // one movie can have many genres,
    // and one genre can belong to many movies.
    
    GenreInfo  []GenreInfo      `json:"genres" gorm:"many2many:genres_movies"` // decoded from the "genres" JSON key; linked through the genres_movies join table
    MovieVideo []MovieVideoInfo `json:"-" gorm:"foreignKey:MovieID"` // excluded from JSON; related rows reference this movie via MovieID
}

// PersonInfo describes one person. The `json` tags map fields to JSON keys and
// the `gorm` tags control how the struct is persisted.
type PersonInfo struct {
	Adult  bool `json:"adult"`
	// TODO(original note): "also known as" names are not captured here.
	Gender int  `json:"gender"` // original note says 1 or 2 — TODO confirm against the TMDB API docs
	Id     uint `json:"id" gorm:"primarykey"`

	Department string  `json:"known_for_department"`
	Name               string  `json:"name"`
	Popularity         float64 `json:"popularity"`
	ProfilePath        string  `json:"profile_path"`
	// gorm bookkeeping fields, currently disabled:
	//CreatedAt time.Time      `json:"-"`
	//UpdatedAt time.Time      `json:"-"`
	//DeletedAt gorm.DeletedAt `gorm:"index" json:"-"`

	// JSON only: decoded from the "movie_credits" key and ignored by gorm.
	MovieCredits movieCreditAPIData `json:"movie_credits" gorm:"-"`
	// A person has many movie characters; excluded from JSON, linked via PersonID.
	MovieCharacter []MovieCharacter `json:"-" gorm:"foreignKey:PersonID"`
	// Crew entries for the person; excluded from JSON, linked via PersonID.
	PersonCrew []PersonCrew `json:"-" gorm:"foreignKey:PersonID"`
}

Functions:

All crawling uses concurrency to improve performance.

Movie Fetcher

@Params: ids : a list of movie ids
@Params: moviePath : the location where the movie JSON data is stored
func FetchMovieInfosViaIDS(ids []int,moviePath string)

Person Fetcher

@Params: ids : a list of person ids
@Params: personPath : the location where the person JSON data is stored
func FetchPersonInfosViaIDS(ids []int,personPath string)

Main Procedure

Step1: Get all available from TMDB
Step2: Get all movies id from TMDB JSON
Step3: Get all person id from TMDB JSON
Step4: API crawling (movies: 600,000+ records; persons: 2,000,000+ records) -> takes around 1-2 hours
(You can skip the steps below if you do not need them.)
Step5: Create Database table
Step6: Insert all movies and persons to db
Step7: Done....

example:

// Package-level settings with their default values.
var (
    // Database connection defaults.
    sqlHOST  = "127.0.0.1"
    port     = 5432
    userName = "postgres"
    password string
    db       = "TMDB" // database name

    // Storage locations; main refuses to run while either is empty.
    moviePath  string
    PersonPath string

    // When true, main creates the database tables before crawling.
    migration bool
)

// main is the program entry point. It calls readArgc (expected to fill the
// package-level settings from the command line — confirm against its
// definition), opens the Postgres connection through gorm, optionally creates
// the tables, and then runs the movie and person crawling procedures.
//
// Any database error is logged and main returns without crawling.
func main() {
    readArgc()
    if PersonPath == "" || moviePath == "" {
        log.Fatalln("FilePath can't be empty")
    }

    log.Println("Configuring the database...")
    config := dbConfigure()
    // Named conn so the handle does not shadow the package-level db string.
    conn, err := gorm.Open(postgres.Open(config), &gorm.Config{})
    if err != nil {
        log.Println(err)
        return
    }
    log.Println("DB Configuration Done...")

    if migration {
        log.Println("Creating table...")

        // Migrate one model per call, in the original order, and stop at the
        // first failure instead of silently ignoring the returned error.
        models := []interface{}{
            &webCrawler.GenreInfo{},
            &webCrawler.MovieInfo{},
            &webCrawler.GenresMovies{},
            &webCrawler.PersonInfo{},
            &webCrawler.MovieCharacter{},
            &webCrawler.PersonCrew{},
        }
        for _, model := range models {
            if err := conn.AutoMigrate(model); err != nil {
                log.Println(err)
                return
            }
        }

        // Rework the genres_movies keys: drop the existing primary key, make
        // the (genre_info_id, movie_info_id) pair unique, then use id as the
        // primary key. The statements must run in this order.
        statements := []string{
            "ALTER TABLE genres_movies DROP CONSTRAINT genres_movies_pkey",
            "ALTER TABLE genres_movies ADD CONSTRAINT  genres_movies_unique UNIQUE(genre_info_id,movie_info_id)",
            "ALTER TABLE genres_movies ADD CONSTRAINT genres_movies_pkey PRIMARY KEY (id)",
        }
        for _, stmt := range statements {
            if err := conn.Exec(stmt).Error; err != nil {
                log.Println(err)
                return
            }
        }
    }

    // Fetch genres and movies.
    movieCrawlerProcedure(conn)

    // Fetch all persons.
    personCrawlerProcedure(conn)
}