
Ptt Pages and Posts => json

Primary Language: Jupyter Notebook


A simple Python library that extracts page and post information from https://www.ptt.cc/bbs/ and converts it to JSON format.

>>> from ptt2json import *
>>> ptt = PttPage(boardname="Gossiping")
>>> print(ptt.posts)

[{'url': '/bbs/Gossiping/M.1560591164.A.B9C.html',
  'post_id': 'M.1560591164.A.B9C',
  'timestamp': '1560591164',
  'title': '[新聞] 暴動!財經女神訪歐曬日光浴 白皙長腿惹',
  'nrec': '',
  'author': 'cycling',
  'mark': ''},
 {'url': '/bbs/Gossiping/M.1560591174.A.B05.html',
  'post_id': 'M.1560591174.A.B05',
  'timestamp': '1560591174',
  'title': '[新聞] 韓國瑜造勢到底多少人? 椅子精算師四叉貓算給你',
  'nrec': '',
  'author': 'sweat992001',
  'mark': ''},
 {'url': '/bbs/Gossiping/M.1560591182.A.50D.html',
  'post_id': 'M.1560591182.A.50D',
  'timestamp': '1560591182',
  'title': 'Re: [新聞] 大烏龍!攝影師砸30萬修MacBook 最後發現',
  'nrec': '',
  'author': 'YHOTV4096',
  'mark': ''},


		"url": str,
		"post_id": str,
		"timestamp": str,  # unix time
		"title": str,      
		"nrec": str,       # 推噓文相加總和
		"author": str,
		"mark": str,       # 標記


	"article_id": str,
    "article_title": str,
    "author": str,
    "board": str,
    "content": str,
    "timestamp": int,
    "ip": str,           # ipv4 address 
    "ip_country": str,   # ip <-> country mapping
    "message_count": {
         "all": str,     # 推、噓、箭頭總數
         "boo": str,     # 噓文
         "count": str,   # 推 - 噓文
         "neutral": str, # 箭頭
         "push": str,    # 推文
    },
    "messages": [
        {
            "push_tag": str, # 評論符號
            "push_userid": str,
            "push_content": str,
            "push_ipdatetime": str, # ip 與時間(無日期)
        },
    ],
    "url": str,
    "is_404": bool,      # 是否刪文