Update script that creates managed rule list used by Config (#6823)
This commit is contained in:
parent
5f1fb65038
commit
179a228741
File diff suppressed because it is too large
Load Diff
@ -1,19 +1,22 @@
|
||||
#!/usr/bin/env python
|
||||
"""Download markdown files with AWS managed ConfigRule info and convert to JSON.
|
||||
"""Scrape web-based docs for AWS managed ConfigRule info and convert to JSON.
|
||||
|
||||
Invocation: ./pull_down_aws_managed_rules.py
|
||||
- Install ../requirements-tests.txt packages to ensure the lxml package
|
||||
is installed.
|
||||
- Execute from the moto/scripts directory.
|
||||
- To track download progress, use the "-v" command line switch.
|
||||
- MANAGED_RULES_OUTPUT_FILENAME is the variable containing the name of
|
||||
the file that will be overwritten when this script is run.
|
||||
- To track progress, use the "-v" command line switch.
|
||||
- MANAGED_RULES_OUTPUT_FILENAME is the variable with the output filename.
|
||||
The file is overwritten when this script is successfully run.
|
||||
|
||||
NOTE: This script takes a while to download all the files.
|
||||
NOTE: This script takes a while to scrape all the web pages. The
|
||||
scraping could be parallelized, but since this script might only be
|
||||
run once every couple of months, it wasn't worth the complexity.
|
||||
|
||||
Summary:
|
||||
The first markdown file is read to obtain the names of markdown files
|
||||
for all the AWS managed config rules. Then each of those markdown files
|
||||
are read and info is extracted with the final results written to a JSON
|
||||
file.
|
||||
An initial web page is parsed to obtain the links for all the other
|
||||
docs for AWS managed config rules. Each of those links is parsed
|
||||
and the needed info is written to a JSON file.
|
||||
|
||||
The JSON output will look as follows:
|
||||
|
||||
@ -31,6 +34,7 @@ Summary:
|
||||
}
|
||||
],
|
||||
"Trigger type": "Periodic",
|
||||
"Resource Types": "AWS::IAM::User"
|
||||
},
|
||||
},
|
||||
...
|
||||
@ -40,98 +44,118 @@ Summary:
|
||||
import argparse
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
|
||||
from lxml import html
|
||||
import requests
|
||||
|
||||
MANAGED_RULES_OUTPUT_FILENAME = "../moto/config/resources/aws_managed_rules.json"
|
||||
|
||||
AWS_MARKDOWN_URL_START = "https://raw.githubusercontent.com/awsdocs/aws-config-developer-guide/main/doc_source/"
|
||||
AWS_CONFIG_MANAGED_RULES_URL_START = (
|
||||
"https://docs.aws.amazon.com/config/latest/developerguide/"
|
||||
)
|
||||
|
||||
LIST_OF_MARKDOWNS_URL = "managed-rules-by-aws-config.md"
|
||||
LIST_OF_RULES_URL = "managed-rules-by-aws-config.html"
|
||||
|
||||
|
||||
def extract_param_info(line):
|
||||
"""Return dict containing parameter info extracted from line."""
|
||||
# Examples of parameter definitions:
|
||||
# maxAccessKeyAgeType: intDefault: 90
|
||||
# IgnorePublicAcls \(Optional\)Type: StringDefault: True
|
||||
# MasterAccountId \(Optional\)Type: String
|
||||
# endpointConfigurationTypesType: String
|
||||
def extract_param_info(page_content):
|
||||
"""Return dict containing parameter info extracted from page.
|
||||
|
||||
values = re.split(r":\s?", line)
|
||||
name = values[0]
|
||||
param_type = values[1]
|
||||
The info for all parameters (not one per parameter) is within a single "dl" tag,
|
||||
with "dt" tags providing the details. A "dt" tag without a colon
|
||||
provides the parameter name and indicates that the "dt" tags that follow
|
||||
provide details for that parameter up until the next "dt" tag without a
|
||||
colon or the end of the "dl" tag.
|
||||
"""
|
||||
dl_tags = page_content.xpath('//div[@class="variablelist"]//dl')
|
||||
if len(dl_tags) > 1:
|
||||
print(
|
||||
f"ERROR: Found {len(dl_tags)} 'dl' tags for parameters; "
|
||||
"only expecting one. Ignoring extra 'dl' tag.",
|
||||
file=sys.stderr
|
||||
)
|
||||
|
||||
# If there is no Optional keyword, then sometimes there
|
||||
# isn't a space between the parameter name and "Type".
|
||||
name = re.sub("Type$", "", name)
|
||||
dt_tags = dl_tags[0].xpath(".//dt")
|
||||
|
||||
# Sometimes there isn't a space between the type and the
|
||||
# word "Default".
|
||||
if "Default" in param_type:
|
||||
param_type = re.sub("Default$", "", param_type)
|
||||
all_params = []
|
||||
param_details = {}
|
||||
for dt_tag in dt_tags:
|
||||
text = dt_tag.text_content()
|
||||
if not text or text == "None":
|
||||
continue
|
||||
|
||||
optional = False
|
||||
if "Optional" in line:
|
||||
optional = True
|
||||
# Remove "Optional" from the line.
|
||||
name = name.split()[0]
|
||||
# If a colon is NOT present, this is the parameter name and not
|
||||
# a key, value pair.
|
||||
if ": " not in text:
|
||||
# If parameter info has been collected, save it and start a
|
||||
# collection for this new parameter.
|
||||
if param_details:
|
||||
all_params.append(param_details)
|
||||
param_details = {}
|
||||
if "Optional" in text:
|
||||
text = text.split()[0]
|
||||
param_details["Optional"] = True
|
||||
else:
|
||||
param_details["Optional"] = False
|
||||
param_details["Name"] = text
|
||||
continue
|
||||
|
||||
param_info = {
|
||||
"Name": name,
|
||||
"Optional": optional,
|
||||
"Type": param_type,
|
||||
}
|
||||
key, value = text.split(": ")
|
||||
param_details[key] = value
|
||||
|
||||
# A default value isn't always provided.
|
||||
if len(values) > 2:
|
||||
param_info["Default"] = values[2]
|
||||
# Collect the last parameter found.
|
||||
if param_details:
|
||||
all_params.append(param_details)
|
||||
|
||||
return param_info
|
||||
return all_params
|
||||
|
||||
|
||||
def extract_managed_rule_info(lines):
|
||||
"""Return dict of qualifiers/rules extracted from a markdown file."""
|
||||
def extract_managed_rule_info(page_content):
|
||||
"""Return dict of qualifiers/rules extracted from web page.
|
||||
|
||||
An example of the html that's being processed:
|
||||
|
||||
<div id="main-content" class="awsui-util-container">
|
||||
...
|
||||
|
||||
<h1 class="topictitle" id="access-keys-rotated">access-keys-rotated</h1>
|
||||
<p><b>Identifier:</b> ACCESS_KEYS_ROTATED</p>
|
||||
<p><b>Resource Types:</b> AWS::IAM::User</p>
|
||||
<p><b>Trigger type:</b> Periodic</p>
|
||||
<p><b>AWS Region:</b> All supported AWS regions except Middle East (UAE),
|
||||
Asia Pacific (Hyderabad), Asia Pacific (Melbourne), Israel (Tel Aviv),
|
||||
Europe (Spain), Europe (Zurich) Region</p>
|
||||
<p><b>Parameters:</b></p>
|
||||
<div class="variablelist">
|
||||
<dl>
|
||||
<dt><span class="term">maxAccessKeyAge</span></dt>
|
||||
<dt><span class="term">Type: int</span></dt>
|
||||
<dt><span class="term">Default: 90</span></dt>
|
||||
<dd>
|
||||
<p>Maximum number of days without rotation. Default 90.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
|
||||
...
|
||||
</div>
|
||||
"""
|
||||
rule_info = {}
|
||||
label_pattern = re.compile(r"(?:\*\*)(?P<label>[^\*].*)\:\*\*\s?(?P<value>.*)?")
|
||||
paragraphs = page_content.xpath('//div[@id="main-content"]/descendant::p')
|
||||
|
||||
collecting_params = False
|
||||
params = []
|
||||
for line in lines:
|
||||
if not line:
|
||||
continue
|
||||
line = line.replace("\\", "").strip()
|
||||
|
||||
# Parameters are listed in the lines following the label, so they
|
||||
# require special processing.
|
||||
if collecting_params:
|
||||
# A new header marks the end of the parameters.
|
||||
if line.startswith("##"):
|
||||
rule_info["Parameters"] = params
|
||||
break
|
||||
|
||||
if "Type: " in line:
|
||||
params.append(extract_param_info(line))
|
||||
for paragraph in paragraphs:
|
||||
text = paragraph.text_content()
|
||||
if ": " not in text:
|
||||
continue
|
||||
|
||||
# Check for a label starting with two asterisks.
|
||||
matches = re.match(label_pattern, line)
|
||||
if not matches:
|
||||
parts = text.split(": ")
|
||||
if len(parts) > 2:
|
||||
continue
|
||||
|
||||
# Look for "Identifier", "Trigger type", "AWS Region" and
|
||||
# "Parameters" labels and store the values for all but parameters.
|
||||
# Parameters values aren't on the same line as labels.
|
||||
label = matches.group("label")
|
||||
value = matches.group("value")
|
||||
if label in ["Identifier", "Trigger type", "AWS Region"]:
|
||||
rule_info[label] = value
|
||||
elif label == "Parameters":
|
||||
collecting_params = True
|
||||
else:
|
||||
print(f"ERROR: Unknown label: '{label}', line: '{line}'", file=sys.stderr)
|
||||
if parts[0] in ["Identifier", "Trigger type", "AWS Region", "Resource Types"]:
|
||||
rule_info[parts[0]] = parts[1]
|
||||
|
||||
# The parameters are in their own "div", so handle them separately.
|
||||
rule_info["Parameters"] = extract_param_info(page_content)
|
||||
return rule_info
|
||||
|
||||
|
||||
@ -139,35 +163,33 @@ def process_cmdline_args():
|
||||
"""Return parsed command line arguments."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description=(
|
||||
f"Download AWS config rules and merge output to create the "
|
||||
f"JSON file {MANAGED_RULES_OUTPUT_FILENAME}"
|
||||
"Scrape web pages with AWS config rules and merge results to "
|
||||
f"create the JSON file {MANAGED_RULES_OUTPUT_FILENAME}"
|
||||
)
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="Report on progress of downloads"
|
||||
"-v", "--verbose", action="store_true", help="Report on progress"
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
"""Create a JSON file containing info pulled from AWS markdown files."""
|
||||
"""Create a JSON file containing info pulled from AWS online docs."""
|
||||
args = process_cmdline_args()
|
||||
|
||||
# Get the markdown file with links to the markdown files for services.
|
||||
req = requests.get(AWS_MARKDOWN_URL_START + LIST_OF_MARKDOWNS_URL)
|
||||
# Get the list of links for all the managed rules.
|
||||
page = requests.get(AWS_CONFIG_MANAGED_RULES_URL_START + LIST_OF_RULES_URL)
|
||||
tree = html.fromstring(page.content)
|
||||
links = [x.lstrip("./") for x in tree.xpath('//div[@class="highlights"]//ul//a/@href')]
|
||||
|
||||
# Extract the list of all the markdown files on the page.
|
||||
link_pattern = re.compile(r"\+ \[[^\]]+\]\(([^)]+)\)")
|
||||
markdown_files = link_pattern.findall(req.text)
|
||||
|
||||
# For each of those markdown files, extract the id, region, trigger type
|
||||
# and parameter information.
|
||||
# From each linked page, extract the id, region, trigger type, resource
|
||||
# types and parameter information.
|
||||
managed_rules = {"ManagedRules": {}}
|
||||
for markdown_file in markdown_files:
|
||||
for link in links:
|
||||
if args.verbose:
|
||||
print(f"Downloading {markdown_file} ...")
|
||||
req = requests.get(AWS_MARKDOWN_URL_START + markdown_file)
|
||||
rules = extract_managed_rule_info(req.text.split("\n"))
|
||||
print(f"Extracting from {link} ...")
|
||||
page = requests.get(AWS_CONFIG_MANAGED_RULES_URL_START + link)
|
||||
rules = extract_managed_rule_info(html.fromstring(page.content))
|
||||
|
||||
rule_id = rules.pop("Identifier")
|
||||
managed_rules["ManagedRules"][rule_id] = rules
|
||||
|
Loading…
Reference in New Issue
Block a user