/ARXIV-NLP-Analysis

A repository showcasing my technical skills in NLP, using data scraped from arXiv scientific papers.

Primary Language: HTML · License: BSD 3-Clause "New" or "Revised" License (BSD-3-Clause)

ARXIV NLP Analysis

Tasks

  • Scraping
  • Topic prediction
  • RAG

Scraping

For this project, I work with a dataset of abstracts of scientific papers from arXiv. The first step is to get the category labels from the arXiv main page. I need them to map the subcategories of the scraped papers to a broader class.

Imports

import arxiv
import re
import requests
import srsly
from attrs import define, field, converters, asdict
from bs4 import BeautifulSoup
from typing import Iterable, Union, List, Set
from tqdm import tqdm

Paper_abstract class

@define
class Paper_abstract:
    """Container for one scraped paper and its abstract.

    Attributes:
        title: Title of the paper.
        authors: Names of the authors.
        raw_abstract: Abstract text exactly as it was scraped.
        publication_date: Publication date of the paper.
        paper_link: URL of the paper.
        categories: One or several categories; always stored as a list.
        abstract: Processed abstract. When it is not supplied, it is derived
            from ``raw_abstract`` by ``clean_abstract``.
    """
    title: str = field()
    authors: Union[List, Set] = field()
    raw_abstract: str = field()
    publication_date = field()
    paper_link: str = field()
    categories: Union[List, Set] = field(converter=list)
    abstract: str = field()

    @abstract.default
    def clean_abstract(self):
        """Build the default value of ``abstract`` from ``raw_abstract``.

        Line breaks are collapsed into single spaces, every ``$...$`` span is
        replaced by ``__FORMULA__`` and LaTeX commands of the form
        ``\\name{text}`` are reduced to ``text``.
        """
        single_line = ' '.join(self.raw_abstract.splitlines())
        without_formulas = re.sub(r'\$[^\$]+\$', r'__FORMULA__', single_line)
        return re.sub(r'\\[a-zA-Z]+{([^}]*)}', r'\1', without_formulas)

Extract abstracts

get_labels

def get_labels(url: str = 'https://arxiv.org/') -> dict:
    """Map every sub-category tag found on a page to its category name.

    The page at ``url`` is downloaded and its ``<div id="content">`` is
    scanned in document order: each ``<h2>`` starts a new category, and the
    text of every following ``<strong>`` is recorded as a sub-category tag of
    that category. ``<strong>`` tags seen before the first ``<h2>`` are
    ignored.

    Args:
        url: Page to scrape (default: the arXiv main page).

    Returns:
        A dictionary ``{sub-category tag: category name}``.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        ValueError: If the page has no ``<div id="content">``.
    """
    # A timeout prevents the scraper from hanging forever on a stalled
    # connection; raise_for_status surfaces HTTP errors immediately instead
    # of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    content = soup.find('div', id='content')
    if content is None:
        raise ValueError(f'No <div id="content"> found at {url}')

    sub_cat_to_cat = {}
    category = None
    for tag in content.find_all(['h2', 'strong']):
        # Compare the tag name rather than searching the serialized HTML, so
        # headings carrying attributes are recognised as well.
        if tag.name == 'h2':
            category = tag.get_text()
        elif category:
            sub_cat_to_cat[tag.get_text()] = category
    return sub_cat_to_cat

Map subcategories to a label using information scraped from a website (default: https://arxiv.org/).

fill_categories

def fill_categories(map_labels_dict: dict, category_results: list, current_label: str) -> list:
    """Assign one or several labels to an abstract.

    Each entry of ``category_results`` is reduced to the part before its
    first ``.`` and looked up in ``map_labels_dict``. Entries without a match
    are skipped.

    Args:
        map_labels_dict: Mapping of sub-category tag to category label.
        category_results: Category identifiers attached to the paper.
        current_label: Fallback label used when nothing can be mapped.

    Returns:
        The matching labels as a list, without duplicates and in the order
        of their first appearance in ``category_results``; or
        ``[current_label]`` when no label could be assigned.
    """
    prefixes = (subcat.split('.')[0] for subcat in category_results)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is a real list (as annotated) with a deterministic order.
    categories = list(dict.fromkeys(
        map_labels_dict[prefix]
        for prefix in prefixes
        if prefix in map_labels_dict
    ))
    return categories or [current_label]

Assign one or several categories to an abstract. If no category can be assigned, assign the name of the category currently being scraped.

get_abstracts

def get_abstracts(n_abstracts=100, save=False):
    """Scrape paper abstracts from arXiv, per main label.

    The labels are obtained with ``get_labels``; for each distinct label an
    arXiv search (sorted by submission date) is run and each result is turned
    into a ``Paper_abstract``. A paper whose title was already seen is
    skipped, so the result contains no duplicate titles.

    Args:
        n_abstracts: Maximum number of results requested per label.
        save: When true, write the papers to
            ``data_models/abstracts.jsonl`` (one JSON object per line).

    Returns:
        The list of ``Paper_abstract`` instances that were collected.
    """
    client = arxiv.Client()
    subcat_to_label = get_labels()
    abstracts = []
    titles = set()
    # One search per distinct main label.
    for label in set(subcat_to_label.values()):
        print(f'{label} abstracts:')
        search = arxiv.Search(
            query=label,
            max_results=n_abstracts,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        for r in tqdm(client.results(search)):
            # Keep only the first paper seen for a given title.
            if r.title in titles:
                continue
            titles.add(r.title)
            paper = Paper_abstract(
                title=r.title,
                authors=[author.name for author in r.authors],
                raw_abstract=r.summary,
                publication_date=r.published,
                paper_link=r.pdf_url,
                categories=fill_categories(subcat_to_label, r.categories, label),
            )
            abstracts.append(paper)
    # Save into a jsonl file if wanted.
    if save:
        from pathlib import Path

        output_file = 'data_models/abstracts.jsonl'
        # Make sure the target folder exists so a long scrape is not lost to
        # a missing directory at write time.
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)
        srsly.write_jsonl(output_file,
                          [asdict(abstract) for abstract in abstracts])
        print(f'Saved {len(abstracts)} articles')
    return abstracts

Extracts n (default: 100) abstracts per category, together with their metadata. Duplicates are removed.

main

# Script entry point: request up to 200 abstracts per label and save them.
if __name__ == '__main__':
    get_abstracts(200, save=True)

Topic modeling

Get the topics of a text input. Each topic is returned as a probability together with a list of words that may belong to that topic.

Imports

from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from transformers.pipelines import pipeline
# from more_itertools import collapse
from tqdm import tqdm
import spacy
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer


# Module-level spaCy pipeline, loaded with the 'ner' component excluded.
nlp = spacy.load('en_core_web_trf', exclude = ['ner'])

Bertopic

bertopic_model

def bertopic_model(texts):
    """Fit, save and visualise a BERTopic model on ``texts``.

    The model is built from:
        * embedding_model: SentenceTransformer "all-MiniLM-L6-v2",
        * umap_model: UMAP (10 components, cosine metric, fixed seed),
        * hdbscan_model: HDBSCAN (min_cluster_size=30),
        * vectorizer_model: CountVectorizer (English stop words, 1-4 grams),
        * ctfidf_model: ClassTfidfTransformer,
    with ``top_n_words=10`` and ``verbose=True``.

    Side effects:
        * the fitted model is saved as "data_models/bertopic_" (pickle),
        * "data_models/topics.html" and "data_models/documents.html" are
          written with the topic and document visualisations.

    Args:
        texts: Documents to fit the model on.
            NOTE(review): presumably a list of strings — confirm with callers.

    Returns:
        The fitted BERTopic model.
    """
    umap_model = UMAP(n_neighbors=20, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    vectorizer_model = CountVectorizer(stop_words="english", max_features=800, ngram_range=(1, 4))
    ctfidf_model = ClassTfidfTransformer()
    embeddings = SentenceTransformer("all-MiniLM-L6-v2")
    topic_model = BERTopic(
        embedding_model=embeddings,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        top_n_words=10,
        verbose=True)

    # Encode the documents once and reuse the vectors for both the model fit
    # and the 2-D projection below, instead of embedding the corpus twice.
    embed_docs = embeddings.encode(texts, show_progress_bar=True)
    topic_model.fit_transform(texts, embeddings=embed_docs)
    # save model
    topic_model.save("data_models/bertopic_", serialization="pickle")

    reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embed_docs)

    # topic visualization
    fig1 = topic_model.visualize_topics()
    fig1.write_html('data_models/topics.html')
    # document visualization
    fig2 = topic_model.visualize_documents(texts, reduced_embeddings=reduced_embeddings)
    fig2.write_html("data_models/documents.html")
    return topic_model



Preprocessing

Preprocessing functions

Imports

import spacy
# Module-level spaCy pipeline used by the preprocessing functions.
nlp = spacy.load('en_core_web_trf')

sentencize

def sentencize(texts):
    """Split one text or several texts into sentences with spaCy.

    A single string is handled as a one-element batch. The 'ner', 'tagger',
    'attribute_ruler' and 'lemmatizer' components are disabled while the
    texts are processed.

    Returns:
        A list with, for each input text, the list of its sentences
        (whitespace-stripped strings).
    """
    batch = [texts] if isinstance(texts, str) else texts
    unused_components = ['ner', 'tagger', 'attribute_ruler', 'lemmatizer']
    sentences = []
    for doc in nlp.pipe(batch, disable=unused_components):
        sentences.append([sent.text.strip() for sent in doc.sents])
    return sentences

RAG

Use a local LLM (here mistral 7b instruct) and a vector database to answer a query over documents.

Imports

from pathlib import Path

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.document_loaders import JSONLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.llms import LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Callback manager with a stdout streaming handler, handed to the LLM below.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Local LLM loaded through LlamaCpp from a GGUF file under ~/llms.
# NOTE(review): max_tokens (2000) is larger than n_ctx (1024) — confirm this
# combination is intended.
llm = LlamaCpp(
    model_path=f'{Path.home()}/llms/mistral-7b-instruct-v0.1.Q4_K_M.gguf',
    temperature=0,
    max_tokens=2000,
    n_ctx = 1024,
    top_p=1,
    n_batch = 512,
    callback_manager=callback_manager,
    verbose=True,
    streaming=True,
    stop=['User:'],
    f16_kv=True,
)
# Embedding model used for the vector store: BGE small (English), on CPU,
# with normalized embeddings.
embeddings_name = "BAAI/bge-small-en"
embeddings_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=embeddings_name, model_kwargs=embeddings_kwargs,
    encode_kwargs=encode_kwargs
)

RAG

Document loading

def metadata_func(record: dict, metadata: dict) -> dict:
    """Add the paper title and author names to a document's metadata.

    Args:
        record: One parsed JSON record; its "title" and "authors" keys are
            read.
        metadata: Metadata dictionary to complete; it is updated in place.

    Returns:
        ``metadata`` with a "title" entry and an "authors" entry holding the
        author names joined by ", " (an empty string when the record has no
        authors).
    """
    metadata["title"] = record.get("title")
    # A record without an "authors" key (or with a null value) must not
    # abort the whole load with a TypeError in str.join.
    metadata["authors"] = ', '.join(record.get("authors") or [])
    return metadata


def load_documents(path):
    """Read a jsonl file of papers and return the loaded documents.

    Each line is one JSON record: its "abstract" field becomes the document
    content and ``metadata_func`` fills in the metadata.
    """
    return JSONLoader(
        file_path=path,
        jq_schema=".",
        content_key="abstract",
        json_lines=True,
        metadata_func=metadata_func,
    ).load()

build_vector_store

def build_vector_store(path='data_models/abstracts.jsonl',
                       persist_dir = 'db'):
    """Return the Chroma vector store, creating it when needed.

    - If ``persist_dir`` is an existing folder, the store is opened from it.
    - Otherwise the documents at ``path`` are loaded, split into chunks of
      300 characters (overlap 50), indexed, and the new store is persisted
      in ``persist_dir``.
    """
    # Reuse the store that is already on disk.
    if Path(persist_dir).is_dir():
        return Chroma(persist_directory=persist_dir,
                      embedding_function=embeddings)

    # First run: build the store from the abstracts file.
    papers = load_documents(path)
    splitter = RecursiveCharacterTextSplitter(chunk_size=300,
                                              chunk_overlap=50)
    chunks = splitter.split_documents(papers)
    store = Chroma.from_documents(documents=chunks,
                                  embedding=embeddings,
                                  persist_directory=persist_dir)
    store.persist()
    return store


# Module-level vector store, built or loaded once when the module is imported.
db = build_vector_store()
def format_chat_prompt(message):
    """Wrap ``message`` in the chat prompt sent to the LLM.

    The prompt is a "System:" section with the fixed answering instructions,
    followed by a "User:" section holding ``message`` between ``[INST]`` and
    ``[/INST]`` markers.
    """
    instruction = (
        "Use the following pieces of context to answer the question at the end. "
        "If you don't know the answer, just say that you don't know, "
        "don't try to make up an answer. "
        "Use three sentences maximum. "
        "Keep the answer as concise as possible. "
        'Always say "thanks for asking!" at the end of the answer.'
        "\n    "
    )
    return f"System:{instruction}\nUser: <s>[INST]{message}[/INST]\n"

query the vector store

def semantic_search(query, search_type="similarity"):
    """Answer ``query`` with the local LLM, using the vector store as context.

    A retriever returning the 7 best chunks is created from the module-level
    ``db``, plugged into a "stuff" RetrievalQA chain around ``llm``, and the
    chain is run on the prompt built by ``format_chat_prompt``.

    Args:
        query: Question asked by the user.
        search_type: Retrieval strategy forwarded to the retriever
            (default "similarity").

    Returns:
        The chain output; the source documents are included because the
        chain is built with ``return_source_documents=True``.
    """
    # The keyword expected by as_retriever is `search_type`; passing it as
    # `search` meant the requested strategy was never applied.
    retriever = db.as_retriever(search_type=search_type, search_kwargs={"k": 7})
    qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                           chain_type="stuff",
                                           retriever=retriever,
                                           return_source_documents=True)
    prompt = format_chat_prompt(query)
    response = qa_chain(prompt)
    return response

FastAPI

Models

from pydantic import BaseModel
from scraping import Paper_abstract
from typing import Union, List, Dict, Tuple

class AbstractsModel(BaseModel):
    """Response schema of the abstracts endpoint."""
    # Integer identifier of the response.
    item_id: int
    # Scraped papers, one entry per paper.
    abstracts: list

class TopicModel(BaseModel):
    """Response schema of the topic endpoint."""
    # Text the topics were predicted for.
    input_: str
    # Maps a probability to the list of words of the matching topic.
    response: dict[float, list]

class QueryResponseModel(BaseModel):
    """Response schema of the RAG endpoint."""
    # Question that was asked.
    query: str
    # Answer produced for the question.
    response: str
    # One tuple per source document used to build the answer.
    source_documents: List[Tuple]

App

from typing import Union
from uuid import uuid4
from fastapi import FastAPI
from scraping import get_abstracts
from pathlib import Path
import api_models
from topic_modeling import bertopic_model
from bertopic import BERTopic
from rag import semantic_search
from attrs import asdict
app = FastAPI()

# Sample text used as the default input of the topic endpoint.
text_input = Path('data_models/Falk_et_al_2023.txt').read_text()

@app.get("/")
def read_root():
    """Root endpoint: return a fixed greeting payload."""
    return {"Hello": "World"}


@app.post("/abstracts", response_model=api_models.AbstractsModel, tags=['SCRAPPING'])
def abstracts(n_abstract: int = 100, save: bool = False):
    """Abstracts extraction.

    Args:
        n_abstract: Maximum number of abstracts requested per label.
        save: When true, the scraped abstracts are also saved by
            ``get_abstracts``.

    Returns:
        A dictionary with a freshly generated integer "item_id" and the
        scraped papers, converted to dictionaries, under "abstracts".
    """
    item_id = uuid4()
    # Forward the caller's choice: `save` used to be accepted but ignored.
    papers = get_abstracts(n_abstract, save=save)
    dict_abstracts = [asdict(paper) for paper in papers]
    return {"item_id": item_id.int, "abstracts": dict_abstracts}

@app.post("/topic", response_model=api_models.TopicModel, tags=['TOPIC_MODELING'])
def predict_topic(text_input: str = text_input):
    """Return the topics (lists of words) predicted for a text input.

    An abstract is the expected input. The saved model is loaded from
    'data_models/bertopic_' when that file exists; otherwise abstracts are
    scraped and a new model is fitted before running inference.

    Returns:
        A dictionary with the input text under "input_" and, under
        "response", a mapping of each predicted probability to the words of
        the corresponding topic.
    """
    topic_model = None
    if Path('data_models/bertopic_').is_file():
        topic_model = BERTopic.load('data_models/bertopic_')
    if not topic_model:
        # No usable model: scrape a corpus and fit one first.
        corpus = [paper['abstract'] for paper in abstracts(100)['abstracts']]
        topic_model = bertopic_model(corpus)

    topic_ids, probabilities = topic_model.transform(text_input)
    data = {
        prob: [entry[0] for entry in topic_model.get_topic(topic_nb)]
        for topic_nb, prob in zip(topic_ids, probabilities)
    }
    return {"input_": text_input, "response": data}

@app.post("/rag", response_model=api_models.QueryResponseModel, tags=['RAG'])
def run_semantic_search(query="What is skeleton gait used for?", search_type='similarity'):
    """Answer a query with the vector database and the local LLM.

    Returns:
        A dictionary with the query, the generated answer under "response"
        and, under "source_documents", one (title, authors, content) tuple
        per document used as context.
    """
    answer = semantic_search(query, search_type)
    sources = []
    for document in answer['source_documents']:
        meta = document.metadata
        sources.append((meta['title'], meta['authors'], document.page_content))
    return {"query": query, "response": answer['result'], "source_documents": sources}

Docker

Run `docker build -t arxiv .` to build the image. Run `docker run -p 8050:8050 arxiv` to run the image. Then go to localhost:8050/docs to access the API.

NB: the sentence-transformer model (“all-MiniLM-L6-v2”) is downloaded each time the image starts. This can be avoided by downloading the model and saving it locally when building the image. I have not done so here, to avoid having different model paths on my local computer and in the Docker container.