Update script that creates managed rule list used by Config (#6823)
This commit is contained in:
parent
5f1fb65038
commit
179a228741
File diff suppressed because it is too large
Load Diff
@ -1,19 +1,22 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
"""Download markdown files with AWS managed ConfigRule info and convert to JSON.
|
"""Scrape web-based docs for AWS managed ConfigRule info and convert to JSON.
|
||||||
|
|
||||||
Invocation: ./pull_down_aws_managed_rules.py
|
Invocation: ./pull_down_aws_managed_rules.py
|
||||||
|
- Install ../requirements-tests.txt packages to ensure the lxml package
|
||||||
|
is installed.
|
||||||
- Execute from the moto/scripts directory.
|
- Execute from the moto/scripts directory.
|
||||||
- To track download progress, use the "-v" command line switch.
|
- To track progress, use the "-v" command line switch.
|
||||||
- MANAGED_RULES_OUTPUT_FILENAME is the variable containing the name of
|
- MANAGED_RULES_OUTPUT_FILENAME is the variable with the output filename.
|
||||||
the file that will be overwritten when this script is run.
|
The file is overwritten when this script is successfully run.
|
||||||
|
|
||||||
NOTE: This script takes a while to download all the files.
|
NOTE: This script takes a while to scrape all the web pages. The
|
||||||
|
scraping could be parallelized, but since this script might only be
|
||||||
|
run once every couple of months, it wasn't worth the complexity.
|
||||||
|
|
||||||
Summary:
|
Summary:
|
||||||
The first markdown file is read to obtain the names of markdown files
|
An initial web page is parsed to obtain the links for all the other
|
||||||
for all the AWS managed config rules. Then each of those markdown files
|
docs for AWS managed config rules. Each of those links is parsed
|
||||||
are read and info is extracted with the final results written to a JSON
|
and the needed info is written to a JSON file.
|
||||||
file.
|
|
||||||
|
|
||||||
The JSON output will look as follows:
|
The JSON output will look as follows:
|
||||||
|
|
||||||
@ -31,6 +34,7 @@ Summary:
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"Trigger type": "Periodic"
|
"Trigger type": "Periodic",
|
||||||
|
"Resource Types": "AWS::IAM::User"
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
...
|
...
|
||||||
@ -40,98 +44,118 @@ Summary:
|
|||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import re
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
from lxml import html
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
MANAGED_RULES_OUTPUT_FILENAME = "../moto/config/resources/aws_managed_rules.json"
|
MANAGED_RULES_OUTPUT_FILENAME = "../moto/config/resources/aws_managed_rules.json"
|
||||||
|
|
||||||
AWS_MARKDOWN_URL_START = "https://raw.githubusercontent.com/awsdocs/aws-config-developer-guide/main/doc_source/"
|
AWS_CONFIG_MANAGED_RULES_URL_START = (
|
||||||
|
"https://docs.aws.amazon.com/config/latest/developerguide/"
|
||||||
|
)
|
||||||
|
|
||||||
LIST_OF_MARKDOWNS_URL = "managed-rules-by-aws-config.md"
|
LIST_OF_RULES_URL = "managed-rules-by-aws-config.html"
|
||||||
|
|
||||||
|
|
||||||
def extract_param_info(page_content):
    """Return a list of dicts describing the parameters found on the page.

    The info for all (not each) parameters is contained within a "dl" tag,
    with "dt" tags providing the details.  A "dt" tag without a colon
    provides the parameter name and indicates that the "dt" tags that follow
    provide details for that parameter up until the next "dt" tag without a
    colon or the end of the "dl" tag.

    Args:
        page_content: parsed page object exposing ``xpath()``; the elements
            it yields must expose ``xpath()`` and ``text_content()``.

    Returns:
        A list with one dict per parameter.  Each dict holds "Name" and
        "Optional" plus one entry per "key: value" detail found for that
        parameter (for example "Type" or "Default").  The list is empty
        when the page has no parameter "dl" tag or only a "None" entry.
    """
    dl_tags = page_content.xpath('//div[@class="variablelist"]//dl')
    if not dl_tags:
        # No parameter list on this page; there is nothing to index into.
        return []

    if len(dl_tags) > 1:
        print(
            f"ERROR: Found {len(dl_tags)} 'dl' tags for parameters; "
            "only expecting one. Ignoring extra 'dl' tag.",
            file=sys.stderr
        )

    all_params = []
    param_details = {}
    for dt_tag in dl_tags[0].xpath(".//dt"):
        text = dt_tag.text_content()
        if not text or text == "None":
            continue

        # If a colon is NOT present, this is the parameter name and not
        # a key, value pair.
        if ": " not in text:
            # If parameter info has been collected, save it and start a
            # collection for this new parameter.
            if param_details:
                all_params.append(param_details)
                param_details = {}
            if "Optional" in text:
                # Keep only the name, dropping the "Optional" marker.
                text = text.split()[0]
                param_details["Optional"] = True
            else:
                param_details["Optional"] = False
            param_details["Name"] = text
            continue

        # Split on the first separator only so that a value which itself
        # contains ": " is kept intact instead of breaking the unpacking.
        key, value = text.split(": ", 1)
        param_details[key] = value

    # Collect the last parameter found.
    if param_details:
        all_params.append(param_details)

    return all_params
|
||||||
|
|
||||||
|
|
||||||
|
def extract_managed_rule_info(page_content):
    """Return dict of qualifiers/rules extracted from web page.

    An example of the html that's being processed:

    <div id="main-content" class="awsui-util-container">
    ...

    <h1 class="topictitle" id="access-keys-rotated">access-keys-rotated</h1>
    <p><b>Identifier:</b> ACCESS_KEYS_ROTATED</p>
    <p><b>Resource Types:</b> AWS::IAM::User</p>
    <p><b>Trigger type:</b> Periodic</p>
    <p><b>AWS Region:</b> All supported AWS regions except Middle East (UAE),
        Asia Pacific (Hyderabad), Asia Pacific (Melbourne), Israel (Tel Aviv),
        Europe (Spain), Europe (Zurich) Region</p>
    <p><b>Parameters:</b></p>
    <div class="variablelist">
    <dl>
    <dt><span class="term">maxAccessKeyAge</span></dt>
    <dt><span class="term">Type: int</span></dt>
    <dt><span class="term">Default: 90</span></dt>
    <dd>
    <p>Maximum number of days without rotation. Default 90.</p>
    </dd>
    </dl>

    ...
    </div>

    Args:
        page_content: parsed page object exposing ``xpath()``; the
            paragraph elements it yields must expose ``text_content()``.

    Returns:
        A dict holding whichever of the "Identifier", "Trigger type",
        "AWS Region" and "Resource Types" labels were found, plus a
        "Parameters" entry with the result of ``extract_param_info``.
    """
    wanted_labels = ("Identifier", "Trigger type", "AWS Region", "Resource Types")

    rule_info = {}
    for paragraph in page_content.xpath('//div[@id="main-content"]/descendant::p'):
        # Split on the first ": " only, so a wanted label is kept even when
        # its value happens to contain the separator as well.
        label, separator, value = paragraph.text_content().partition(": ")
        if separator and label in wanted_labels:
            rule_info[label] = value

    # The parameters are in their own "div", so handle them separately.
    rule_info["Parameters"] = extract_param_info(page_content)
    return rule_info
|
||||||
|
|
||||||
|
|
||||||
@ -139,35 +163,33 @@ def process_cmdline_args():
|
|||||||
"""Return parsed command line arguments."""
|
"""Return parsed command line arguments."""
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description=(
|
description=(
|
||||||
f"Download AWS config rules and merge output to create the "
|
"Scrape web pages with AWS config rules and merge results to "
|
||||||
f"JSON file {MANAGED_RULES_OUTPUT_FILENAME}"
|
f"create the JSON file {MANAGED_RULES_OUTPUT_FILENAME}"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-v", "--verbose", action="store_true", help="Report on progress of downloads"
|
"-v", "--verbose", action="store_true", help="Report on progress"
|
||||||
)
|
)
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Create a JSON file containing info pulled from AWS markdown files."""
|
"""Create a JSON file containing info pulled from AWS online docs."""
|
||||||
args = process_cmdline_args()
|
args = process_cmdline_args()
|
||||||
|
|
||||||
# Get the markdown file with links to the markdown files for services.
|
# Get the list of links for all the services.
|
||||||
req = requests.get(AWS_MARKDOWN_URL_START + LIST_OF_MARKDOWNS_URL)
|
page = requests.get(AWS_CONFIG_MANAGED_RULES_URL_START + LIST_OF_RULES_URL)
|
||||||
|
tree = html.fromstring(page.content)
|
||||||
|
links = [x.lstrip("./") for x in tree.xpath('//div[@class="highlights"]//ul//a/@href')]
|
||||||
|
|
||||||
# Extract the list of all the markdown files on the page.
|
# From each linked page, extract the id, region, trigger type and parameter
|
||||||
link_pattern = re.compile(r"\+ \[[^\]]+\]\(([^)]+)\)")
|
# information.
|
||||||
markdown_files = link_pattern.findall(req.text)
|
|
||||||
|
|
||||||
# For each of those markdown files, extract the id, region, trigger type
|
|
||||||
# and parameter information.
|
|
||||||
managed_rules = {"ManagedRules": {}}
|
managed_rules = {"ManagedRules": {}}
|
||||||
for markdown_file in markdown_files:
|
for link in links:
|
||||||
if args.verbose:
|
if args.verbose:
|
||||||
print(f"Downloading {markdown_file} ...")
|
print(f"Extracting from {link} ...")
|
||||||
req = requests.get(AWS_MARKDOWN_URL_START + markdown_file)
|
page = requests.get(AWS_CONFIG_MANAGED_RULES_URL_START + link)
|
||||||
rules = extract_managed_rule_info(req.text.split("\n"))
|
rules = extract_managed_rule_info(html.fromstring(page.content))
|
||||||
|
|
||||||
rule_id = rules.pop("Identifier")
|
rule_id = rules.pop("Identifier")
|
||||||
managed_rules["ManagedRules"][rule_id] = rules
|
managed_rules["ManagedRules"][rule_id] = rules
|
||||||
|
Loading…
Reference in New Issue
Block a user