cpprobotparser
This module is a robots.txt parser written in C++. It answers the question of whether a particular user agent may fetch a given URL on the website that published the robots.txt file. For more details on the structure of robots.txt files, see http://www.robotstxt.org/orig.html.
How to determine which URLs are allowed?
For this purpose the library exports the `RobotsTxtRules` class.
Below is an example of using `RobotsTxtRules`.
Example
#include <iostream>
#include <cpprobotparser.hpp>
using namespace cpprobotparser;
int main(int, char**)
{
const RobotsTxtRules rules(R"(
Sitemap: www.example.com/sitemap.xml
User-agent: Googlebot
Disallow: /oembedB
Disallow: /*/forks
Disallow: /*/*/commits/*/*
Disallow: /*/*/commits/*?author
Disallow: /*/*/commits/*?path
Allow: /*/*/tree/master
Allow: /*/*/blob/master)");
const std::vector<std::string> urlToCheck
{
"http://example.com/index.php",
"http://example.com/oembedB",
"http://example.com/folder1/folder2/commits/article.php?author",
"http://example.com/1/2/blob/master"
};
std::cout << "Rules for GoogleBot:\n";
for (const std::string& url : urlToCheck)
{
const bool isAllowed = rules.isUrlAllowed(url, WellKnownUserAgent::GoogleBot);
std::cout << "The URL: " << url << (isAllowed ? " is allowed" : " is not allowed") << "\n";
}
return 0;
}
Result
Rules for GoogleBot:
The URL: http://example.com/index.php is allowed
The URL: http://example.com/oembedB is not allowed
The URL: http://example.com/folder1/folder2/commits/article.php?author is not allowed
The URL: http://example.com/1/2/blob/master is allowed
Example Of Incorporating Into An Existing CMake Project Using MSVC
Assume that we have a project that consists of a single `src` folder:
cmake_minimum_required(VERSION 3.2)

# Declare the project right after cmake_minimum_required(), as the CMake
# documentation recommends, so it precedes every other command in this file.
set(TEST_PROJECT testproject)
project(${TEST_PROJECT})

# This must be set BEFORE add_executable(): CMAKE_CXX_STANDARD only provides
# the initial value of the CXX_STANDARD property for targets created after it
# is set, so setting it later would leave the executable without C++17.
set(CMAKE_CXX_STANDARD 17)

include(ExternalProject)

# Fetch and build cpprobotparser as part of this build. The install and update
# steps are disabled; the library is used directly from its build directory.
ExternalProject_Add(cpprobotparser
    GIT_REPOSITORY https://github.com/andrascii/cpprobotparser
    GIT_TAG master
    SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/cpprobotparser-src"
    BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/cpprobotparser-build"
    CMAKE_ARGS -DBUILD_TESTS=OFF -DUSE_DYNAMIC_CXX_RUNTIME=ON -DBUILD_AS_SHARED=ON
    INSTALL_COMMAND ""
    UPDATE_COMMAND ""
)

# Exposes the external project's directories as the variables
# Source_Dir and Binary_Dir.
ExternalProject_Get_Property(cpprobotparser Source_Dir Binary_Dir)

# Path to the single `cpprobotparser.hpp` header file
set(CPPROBOTPARSER_INCLUDE_DIR "${Source_Dir}/single_include")

# Path to the debug binary directory created by default by the Microsoft compiler
set(CPPROBOTPARSER_DEBUG_BINARY_DIR "${Binary_Dir}/Debug")

# Path to the release binary directory created by default by the Microsoft compiler
set(CPPROBOTPARSER_RELEASE_BINARY_DIR "${Binary_Dir}/Release")

set(SOURCES_DIR src)

# Collect the sources and headers of our main project
aux_source_directory(${SOURCES_DIR} SOURCES_LIST)
file(GLOB_RECURSE HEADERS_LIST "src/*.h")

add_executable(
    ${TEST_PROJECT}
    ${SOURCES_LIST}
    ${HEADERS_LIST}
)

# NOTE(review): INCLUDE_DIR is not defined anywhere in this snippet, so it
# expands to nothing here unless the surrounding project sets it.
include_directories(${INCLUDE_DIR} ${CPPROBOTPARSER_INCLUDE_DIR})

# Make sure cpprobotparser is built before our executable
add_dependencies(${TEST_PROJECT} cpprobotparser)

# Finally, link our project with cpprobotparser.lib, picking the directory
# that matches the active build configuration
target_link_libraries(${TEST_PROJECT}
    "$<$<CONFIG:Debug>:${CPPROBOTPARSER_DEBUG_BINARY_DIR}/cpprobotparser.lib>" # for debug version
    "$<$<CONFIG:Release>:${CPPROBOTPARSER_RELEASE_BINARY_DIR}/cpprobotparser.lib>" # for release version
)