
A guide for extracting titles, authors, and citations from Google Scholar using Python and Oxylabs SERP Scraper API.

Primary Language: Python

How to Scrape Google Scholar

Oxylabs promo code

Take a look at the process of getting titles, authors, and citations from Google Scholar using Oxylabs SERP Scraper API (1-week free trial) and Python.

For a detailed walkthrough with explanations and visuals, check our blog post.

The complete code

import requests
from bs4 import BeautifulSoup


# Oxylabs API credentials; replace the placeholders with your own values.
USERNAME = "USERNAME"
PASSWORD = "PASSWORD"


def get_html_for_page(url):
    """Fetch the HTML of ``url`` through the Oxylabs SERP Scraper API.

    Args:
        url: The Google Scholar URL to scrape.

    Returns:
        The ``content`` field of the first entry in the API response's
        ``results`` list.

    Raises:
        requests.HTTPError: If the API responds with an error status code.
    """
    payload = {
        "url": url,
        "source": "google",
    }
    response = requests.post(
        "https://realtime.oxylabs.io/v1/queries",
        auth=(USERNAME, PASSWORD),
        json=payload,
    )
    # Surface HTTP errors (e.g. bad credentials) here instead of failing
    # later with a confusing KeyError while reading the JSON body.
    response.raise_for_status()
    return response.json()["results"][0]["content"]

def get_citations(article_id):
    """Return the citation entries listed for a Google Scholar article.

    Args:
        article_id: The article identifier that is interpolated into the
            ``info:<id>:scholar.google.com`` cite query.

    Returns:
        A list of dicts with ``"title"`` (text of the ``th.gs_cith`` cell)
        and ``"content"`` (text of the ``div.gs_citr`` element), one per
        table row that contains both elements.
    """
    url = f"https://scholar.google.com/scholar?q=info:{article_id}:scholar.google.com&output=cite"
    html = get_html_for_page(url)
    soup = BeautifulSoup(html, "html.parser")
    data = []
    for citation in soup.find_all("tr"):
        title_elem = citation.find("th", {"class": "gs_cith"})
        content_elem = citation.find("div", {"class": "gs_citr"})
        # find() returns None when a row lacks the expected cells; skip such
        # rows rather than crashing with AttributeError on .get_text().
        if title_elem is None or content_elem is None:
            continue
        entry = {
            "title": title_elem.get_text(strip=True),
            "content": content_elem.get_text(strip=True),
        }
        data.append(entry)

    return data

def parse_data_from_article(article):
    """Extract title, authors, URL and citations from one search result.

    Args:
        article: A BeautifulSoup element for a single result (the caller
            passes ``div.gs_ri`` elements).

    Returns:
        A dict with the keys ``"title"``, ``"authors"``, ``"url"`` and
        ``"citations"``.
    """
    title_elem = article.find("h3", {"class": "gs_rt"})
    title = title_elem.get_text()
    # The first anchor inside the result supplies both the article link
    # (href) and the id that is passed on to get_citations().
    title_anchor_elem = article.select("a")[0]
    url = title_anchor_elem["href"]
    article_id = title_anchor_elem["id"]
    authors = article.find("div", {"class": "gs_a"}).get_text()
    return {
        "title": title,
        "authors": authors,
        "url": url,
        "citations": get_citations(article_id),
    }

def get_url_for_page(url, page_index):
    """Return ``url`` with an ``&start=<page_index>`` parameter appended."""
    start_param = "&start={}".format(page_index)
    return url + start_param

def get_data_from_page(url):
    """Scrape one results page and return a parsed entry per article.

    Every ``div.gs_ri`` element found in the fetched HTML is handed to
    ``parse_data_from_article``; the results are returned as a list.
    """
    page_soup = BeautifulSoup(get_html_for_page(url), "html.parser")
    parsed_entries = []
    for result in page_soup.find_all("div", {"class": "gs_ri"}):
        parsed_entries.append(parse_data_from_article(result))
    return parsed_entries

data = []
url = "https://scholar.google.com/scholar?q=global+warming+&hl=en&as_sdt=0,5"

# Number of result pages to scrape; increase to collect more articles.
NUM_OF_PAGES = 1

page_index = 0
for _ in range(NUM_OF_PAGES):
    page_url = get_url_for_page(url, page_index)
    entries = get_data_from_page(page_url)
    # Accumulate this page's results; without this they would be discarded.
    data.extend(entries)
    # Advance the `start` offset by 10 for the next page (presumably the
    # number of results per page; adjust if the page size differs).
    page_index += 10


Final word

Check our documentation for more API parameters and variables found in this tutorial.

If you have any questions, feel free to contact us at support@oxylabs.io.