Update script that creates managed rule list used by Config (#6823)

2023-09-20 03:30:58 -04:00 · 2023-09-20 03:30:58 -04:00 · 179a228741
commit 179a228741
parent 5f1fb65038
2 changed files with 1106 additions and 378 deletions
--- a/moto/config/resources/aws_managed_rules.json
+++ b/moto/config/resources/aws_managed_rules.json
--- a/scripts/pull_down_aws_managed_rules.py
+++ b/scripts/pull_down_aws_managed_rules.py
@ -1,19 +1,22 @@
 #!/usr/bin/env python
-"""Download markdown files with AWS managed ConfigRule info and convert to JSON.
+"""Scrape web-based docs for AWS managed ConfigRule info and convert to JSON.
 Invocation:  ./pull_down_aws_managed_rules.py
    - Install ../requirements-tests.txt packages to ensure the lxml package
      is installed.
    - Execute from the moto/scripts directory.
-    - To track download progress, use the "-v" command line switch.
+    - To track progress, use the "-v" command line switch.
-    - MANAGED_RULES_OUTPUT_FILENAME is the variable containing the name of
+    - MANAGED_RULES_OUTPUT_FILENAME is the variable with the output filename.
-      the file that will be overwritten when this script is run.
+      The file is overwritten when this script is successfully run.
-    NOTE:  This script takes a while to download all the files.
+    NOTE:  This script takes a while to scrape all the web pages.  The
    scraping could be parallelized, but since this script might only be
    run once every couple of months, it wasn't worth the complexity.
 Summary:
-    The first markdown file is read to obtain the names of markdown files
+    An initial web page is parsed to obtain the links for all the other
-    for all the AWS managed config rules.  Then each of those markdown files
+    docs for AWS managed config rules.  Each of those links is parsed
-    are read and info is extracted with the final results written to a JSON
+    and the needed info is written to a JSON file.
    file.
    The JSON output will look as follows:
@ -31,6 +34,7 @@ Summary:
                            }
                    ],
                    "Trigger type": "Periodic",
                    "Resource Types": "AWS::IAM::User"
                },
            },
            ...
@ -40,98 +44,118 @@ Summary:
 import argparse
 import json
 import re
 import sys
 from lxml import html
 import requests
 MANAGED_RULES_OUTPUT_FILENAME = "../moto/config/resources/aws_managed_rules.json"
-AWS_MARKDOWN_URL_START = "https://raw.githubusercontent.com/awsdocs/aws-config-developer-guide/main/doc_source/"
+AWS_CONFIG_MANAGED_RULES_URL_START = (
    "https://docs.aws.amazon.com/config/latest/developerguide/"
 )
-LIST_OF_MARKDOWNS_URL = "managed-rules-by-aws-config.md"
+LIST_OF_RULES_URL = "managed-rules-by-aws-config.html"
-def extract_param_info(line):
+def extract_param_info(page_content):
-    """Return dict containing parameter info extracted from line."""
+    """Return list of dicts containing parameter info extracted from page.
    # Examples of parameter definitions:
    #   maxAccessKeyAgeType: intDefault: 90
    #   IgnorePublicAcls \(Optional\)Type: StringDefault: True
    #   MasterAccountId \(Optional\)Type: String
    #   endpointConfigurationTypesType: String
-    values = re.split(r":\s?", line)
+    The info for all (not each) parameters is contained within a "dl" tag,
-    name = values[0]
+    with "dt" tags providing the details.  A "dt" tag without a colon
-    param_type = values[1]
+    provides the parameter name and indicates that the "dt" tags that follow
    provide details for that parameter up until the next "dt" tag without a
    colon or the end of the "dl" tag.
    """
    dl_tags = page_content.xpath('//div[@class="variablelist"]//dl')
    if len(dl_tags) > 1:
        print(
            f"ERROR: Found {len(dl_tags)} 'dl' tags for parameters; "
            "only expecting one.  Ignoring extra 'dl' tag.",
            file=sys.stderr
        )
-    # If there is no Optional keyword, then sometimes there
+    dt_tags = dl_tags[0].xpath(".//dt")
    # isn't a space between the parameter name and "Type".
    name = re.sub("Type$", "", name)
-    # Sometimes there isn't a space between the type and the
+    all_params = []
-    # word "Default".
+    param_details = {}
-    if "Default" in param_type:
+    for dt_tag in dt_tags:
-        param_type = re.sub("Default$", "", param_type)
+        text = dt_tag.text_content()
-
+        if not text or text == "None":
    optional = False
    if "Optional" in line:
        optional = True
        # Remove "Optional" from the line.
        name = name.split()[0]
    param_info = {
        "Name": name,
        "Optional": optional,
        "Type": param_type,
    }
    # A default value isn't always provided.
    if len(values) > 2:
        param_info["Default"] = values[2]
    return param_info
 def extract_managed_rule_info(lines):
    """Return dict of qualifiers/rules extracted from a markdown file."""
    rule_info = {}
    label_pattern = re.compile(r"(?:\*\*)(?P<label>[^\*].*)\:\*\*\s?(?P<value>.*)?")
    collecting_params = False
    params = []
    for line in lines:
        if not line:
            continue
        line = line.replace("\\", "").strip()
        # Parameters are listed in the lines following the label, so they
        # require special processing.
        if collecting_params:
            # A new header marks the end of the parameters.
            if line.startswith("##"):
                rule_info["Parameters"] = params
                break
            if "Type: " in line:
                params.append(extract_param_info(line))
            continue
-        # Check for a label starting with two asterisks.
+        # If a colon is NOT present, this is the parameter name and not
-        matches = re.match(label_pattern, line)
+        # a key, value pair.
-        if not matches:
+        if ": " not in text:
-            continue
+            # If parameter info has been collected, save it and start a
-
+            # collection for this new parameter.
-        # Look for "Identifier", "Trigger type", "AWS Region" and
+            if param_details:
-        # "Parameters" labels and store the values for all but parameters.
+                all_params.append(param_details)
-        # Parameters values aren't on the same line as labels.
+                param_details = {}
-        label = matches.group("label")
+            if "Optional" in text:
-        value = matches.group("value")
+                text = text.split()[0]
-        if label in ["Identifier", "Trigger type", "AWS Region"]:
+                param_details["Optional"] = True
            rule_info[label] = value
        elif label == "Parameters":
            collecting_params = True
            else:
-            print(f"ERROR:  Unknown label: '{label}', line: '{line}'", file=sys.stderr)
+                param_details["Optional"] = False
            param_details["Name"] = text
            continue
        key, value = text.split(": ")
        param_details[key] = value
    # Collect the last parameter found.
    if param_details:
        all_params.append(param_details)
    return all_params
 def extract_managed_rule_info(page_content):
    """Collect a managed rule's qualifiers and parameters from its doc page.

    Every "p" tag beneath the "main-content" div is inspected.  A paragraph
    whose text is a single "label: value" pair contributes an entry when the
    label is "Identifier", "Trigger type", "AWS Region" or "Resource Types".
    Parameter details sit in a separate "div"; they are gathered by
    extract_param_info() and stored under the "Parameters" key.

    A sample of the html being scraped:

    <div id="main-content" class="awsui-util-container">
    ...
    <h1 class="topictitle" id="access-keys-rotated">access-keys-rotated</h1>
    <p><b>Identifier:</b> ACCESS_KEYS_ROTATED</p>
    <p><b>Resource Types:</b> AWS::IAM::User</p>
    <p><b>Trigger type:</b> Periodic</p>
    <p><b>AWS Region:</b> All supported AWS regions except Middle East (UAE),
        Asia Pacific (Hyderabad), Asia Pacific (Melbourne), Israel (Tel Aviv),
        Europe (Spain), Europe (Zurich) Region</p>
    <p><b>Parameters:</b></p>
    <div class="variablelist">
    <dl>
        <dt><span class="term">maxAccessKeyAge</span></dt>
        <dt><span class="term">Type: int</span></dt>
        <dt><span class="term">Default: 90</span></dt>
          <dd>
             <p>Maximum number of days without rotation. Default 90.</p>
          </dd>
      </dl>
    ...
    </div>
    """
    wanted_labels = ("Identifier", "Trigger type", "AWS Region", "Resource Types")
    qualifiers = {}
    for p_tag in page_content.xpath('//div[@id="main-content"]/descendant::p'):
        pieces = p_tag.text_content().split(": ")
        # Text with no separator, or with more than one, is not a simple
        # label/value pair and is ignored.
        if len(pieces) != 2:
            continue
        label, value = pieces
        if label in wanted_labels:
            qualifiers[label] = value

    # Parameters are not in "p" tags; a dedicated helper extracts them.
    qualifiers["Parameters"] = extract_param_info(page_content)
    return qualifiers
@ -139,35 +163,33 @@ def process_cmdline_args():
    """Return parsed command line arguments."""
    parser = argparse.ArgumentParser(
        description=(
-            f"Download AWS config rules and merge output to create the "
+            "Scrape web pages with AWS config rules and merge results to "
-            f"JSON file {MANAGED_RULES_OUTPUT_FILENAME}"
+            f"create the JSON file {MANAGED_RULES_OUTPUT_FILENAME}"
        )
    )
    parser.add_argument(
-        "-v", "--verbose", action="store_true", help="Report on progress of downloads"
+        "-v", "--verbose", action="store_true", help="Report on progress"
    )
    return parser.parse_args()
 def main():
-    """Create a JSON file containing info pulled from AWS markdown files."""
+    """Create a JSON file containing info pulled from AWS online docs."""
    args = process_cmdline_args()
-    # Get the markdown file with links to the markdown files for services.
+    # Get the list of links to the docs for all the managed rules.
-    req = requests.get(AWS_MARKDOWN_URL_START + LIST_OF_MARKDOWNS_URL)
+    page = requests.get(AWS_CONFIG_MANAGED_RULES_URL_START + LIST_OF_RULES_URL)
    tree = html.fromstring(page.content)
    links = [x.lstrip("./") for x in tree.xpath('//div[@class="highlights"]//ul//a/@href')]
-    # Extract the list of all the markdown files on the page.
+    # From each linked page, extract the id, region, trigger type and parameter
-    link_pattern = re.compile(r"\+ \[[^\]]+\]\(([^)]+)\)")
+    # information.
    markdown_files = link_pattern.findall(req.text)
    # For each of those markdown files, extract the id, region, trigger type
    # and parameter information.
    managed_rules = {"ManagedRules": {}}
-    for markdown_file in markdown_files:
+    for link in links:
        if args.verbose:
-            print(f"Downloading {markdown_file} ...")
+            print(f"Extracting from {link} ...")
-        req = requests.get(AWS_MARKDOWN_URL_START + markdown_file)
+        page = requests.get(AWS_CONFIG_MANAGED_RULES_URL_START + link)
-        rules = extract_managed_rule_info(req.text.split("\n"))
+        rules = extract_managed_rule_info(html.fromstring(page.content))
        rule_id = rules.pop("Identifier")
        managed_rules["ManagedRules"][rule_id] = rules