This project crawls CNN news articles and their comments from Disqus.
NOTE: Fields in each file are separated by a tab character ('\t'); the ',' shown in each Format line below stands for that tab.
Format: {Author_ID, Author_Name, Author_Title, Article_Counts}
FileName: "Dict_Authors"
Format: {Art_ID, Art_Topic, Art_Title, Date, Timestamp, URL, Text}
FileName: "Dict_Articles"
Format: {Author_ID, Art_ID}
FileName: "Dict_Author_Article"
Format: {Art_ID, Comment_ID}
FileName: "Dict_Article_Comment"
Format: {ID, User_ID, parent(reply_to), createdAt, likes, dislikes, message}
FileName: "Dict_Comments"
Format: {User_ID, User_Name, Name, joinedAt, reputation}
FileName: "Dict_Users"
Format: {TimeStamp, TotalCount, NewsIDs}
FileName: "news.log"
Format: {NewsID, TotalCount, StartTime, EndTime}
FileName: "comments.log"