This repository was archived by the owner on Sep 1, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcmdline.cpp
More file actions
94 lines (87 loc) · 3.02 KB
/
cmdline.cpp
File metadata and controls
94 lines (87 loc) · 3.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#include "./cmdline.hpp"
#include <iostream>
#include <fstream>
#include "rippy.hpp"
// Usage text shown when rippy is started without arguments or with the
// "help" command. The text itself does not end with a newline.
const char* help = R""""(Rippy is a command-line based tool for scraping webpages.
Usage:
rippy help - Display this help message
rippy create <name> - Create a new project (project.yml) containing the settings and rules for each domain
rippy start - Run the project in the current directory.
For more information, see https://github.com/vortexdevsoftware/Rippy)"""";
// Default contents of project.yml, written verbatim by the "create" command.
// The literal starts right after the opening delimiter, so the generated file
// begins with an empty line and ends with a newline after the last entry.
// NOTE(review): the nesting of the keys under "domains" is decided by the
// indentation inside this literal - confirm it matches what the code reading
// project.yml expects.
const char* defaultProjectConfig = R""""(
# Crawler configuration (YAML)
# The user agent to use when scraping, this is used to identify the scraper to the server.
userAgent: Rippy/1.0
threads: 4 # increasing this will increase the speed of the scraper, but will also increase the load on the network.
depth: 0 # disable depth limit, (e.g. 10000 would limit the scraper to 10000 pages)
saveSession: true # save the session to a file, so that it can be resumed if the program is interrupted
domains:
- domain: https://en.wikipedia.org
filter_mode: blacklist # will skip any subpages that begin with the strings in the filter list whereas whitelist will only visit pages that begin with the strings in the filter list.
start_pages:
- /wiki/Main_Page
filter:
- /w/index.php?title=Special
rules:
- tag: span
attribute: class
value: mw-page-title-main
- tag: div
attribute: id
value: mw-content-text
output: output.txt
)"""";
// Handles the command-line interface.
//
// Commands (taken from argv[1]):
//   (none), help   - print the usage text
//   create <name>  - write the default configuration to project.yml in the
//                    current directory; <name> is required but is not
//                    otherwise used by this function
//   start          - nothing is done here; the caller is told to continue
//
// Returns 0 only for the "start" command, meaning the caller should go on to
// run the project. Returns 1 in every other case (help shown, project file
// created, or an error reported), meaning the caller should exit.
int parseArgs(int argc, char* argv[])
{
    // No command given. Testing argc < 2 (rather than argc == 1) also covers
    // an empty argument vector, where argv[1] must not be read.
    if (argc < 2)
    {
        // The help text has no trailing newline of its own.
        std::cout << help << '\n';
        return 1;
    }

    const std::string command = argv[1];

    if (command == "help")
    {
        std::cout << help << '\n';
        return 1;
    }

    if (command == "create")
    {
        // A project name is mandatory even though only its presence is checked.
        if (argc < 3)
        {
            std::cout << "Please specify a project name.\n";
            return 1;
        }

        // Opening with the default mode replaces any existing project.yml.
        std::ofstream file("project.yml");
        if (!file)
        {
            std::cout << "Error creating project.yml\n";
            return 1;
        }

        file << defaultProjectConfig;
        file.close();

        // Look at the stream state once more after closing: a failed write or
        // flush (e.g. full disk) must not be reported as success.
        if (!file)
        {
            std::cout << "Error writing project.yml\n";
            return 1;
        }

        std::cout << "Created project.yml\n";
        return 1;
    }

    // "start" is the only command that lets the caller carry on.
    if (command == "start")
    {
        return 0;
    }

    std::cout << "Unknown command: " << command << '\n';
    return 1;
}