Update script that creates managed rule list used by Config (#6823)

2023-09-20 03:30:58 -04:00 · 2023-09-20 03:30:58 -04:00 · 179a228741
commit 179a228741
parent 5f1fb65038
2 changed files with 1106 additions and 378 deletions
--- a/moto/config/resources/aws_managed_rules.json
+++ b/moto/config/resources/aws_managed_rules.json
--- a/scripts/pull_down_aws_managed_rules.py
+++ b/scripts/pull_down_aws_managed_rules.py
@ -1,19 +1,22 @@
 #!/usr/bin/env python
-"""Download markdown files with AWS managed ConfigRule info and convert to JSON.
+"""Scrape web-based docs for AWS managed ConfigRule info and convert to JSON.

 Invocation:  ./pull_down_aws_managed_rules.py
+    - Install ../requirements-tests.txt packages to ensure the lxml package
+      is installed.
    - Execute from the moto/scripts directory.
-    - To track download progress, use the "-v" command line switch.
-    - MANAGED_RULES_OUTPUT_FILENAME is the variable containing the name of
-      the file that will be overwritten when this script is run.
+    - To track progress, use the "-v" command line switch.
+    - MANAGED_RULES_OUTPUT_FILENAME is the variable with the output filename.
+      The file is overwritten when this script is successfully run.

-    NOTE:  This script takes a while to download all the files.
+    NOTE:  This script takes a while to scrape all the web pages.  The
+    scraping could be parallelized, but since this script might only be
+    run once every couple of months, it wasn't worth the complexity.

 Summary:
-    The first markdown file is read to obtain the names of markdown files
-    for all the AWS managed config rules.  Then each of those markdown files
-    are read and info is extracted with the final results written to a JSON
-    file.
+    An initial web page is parsed to obtain the links for all the other
+    docs for AWS managed config rules.  Each of those linked pages is
+    parsed and the needed info is written to a JSON file.

    The JSON output will look as follows:

@ -31,6 +34,7 @@ Summary:
                            }
                    ],
                    "Trigger type": "Periodic"
+                    "Resource Types": "AWS::IAM::User"
                },
            },
            ...
@ -40,98 +44,118 @@ Summary:
 import argparse

 import json
-import re
 import sys

+from lxml import html
 import requests

 MANAGED_RULES_OUTPUT_FILENAME = "../moto/config/resources/aws_managed_rules.json"

-AWS_MARKDOWN_URL_START = "https://raw.githubusercontent.com/awsdocs/aws-config-developer-guide/main/doc_source/"
+AWS_CONFIG_MANAGED_RULES_URL_START = (
+    "https://docs.aws.amazon.com/config/latest/developerguide/"
+)

-LIST_OF_MARKDOWNS_URL = "managed-rules-by-aws-config.md"
+LIST_OF_RULES_URL = "managed-rules-by-aws-config.html"


-def extract_param_info(line):
-    """Return dict containing parameter info extracted from line."""
-    # Examples of parameter definitions:
-    #   maxAccessKeyAgeType: intDefault: 90
-    #   IgnorePublicAcls \(Optional\)Type: StringDefault: True
-    #   MasterAccountId \(Optional\)Type: String
-    #   endpointConfigurationTypesType: String
+def extract_param_info(page_content):
+    """Return dict containing parameter info extracted from page.

-    values = re.split(r":\s?", line)
-    name = values[0]
-    param_type = values[1]
+    A single "dl" tag holds the info for all of the parameters (there is
+    not one "dl" tag per parameter), with "dt" tags providing the details.
+    A "dt" tag without a colon provides the parameter name and indicates
+    that the "dt" tags that follow provide details for that parameter, up
+    until the next "dt" tag without a colon or the end of the "dl" tag.
+    """
+    dl_tags = page_content.xpath('//div[@class="variablelist"]//dl')
+    if len(dl_tags) > 1:
+        print(
+            f"ERROR: Found {len(dl_tags)} 'dl' tags for parameters; "
+            "only expecting one.  Ignoring extra 'dl' tag.",
+            file=sys.stderr
+        )

-    # If there is no Optional keyword, then sometimes there
-    # isn't a space between the parameter name and "Type".
-    name = re.sub("Type$", "", name)
+    dt_tags = dl_tags[0].xpath(".//dt")

-    # Sometimes there isn't a space between the type and the
-    # word "Default".
-    if "Default" in param_type:
-        param_type = re.sub("Default$", "", param_type)
-
-    optional = False
-    if "Optional" in line:
-        optional = True
-        # Remove "Optional" from the line.
-        name = name.split()[0]
-
-    param_info = {
-        "Name": name,
-        "Optional": optional,
-        "Type": param_type,
-    }
-
-    # A default value isn't always provided.
-    if len(values) > 2:
-        param_info["Default"] = values[2]
-
-    return param_info
-
-
-def extract_managed_rule_info(lines):
-    """Return dict of qualifiers/rules extracted from a markdown file."""
-    rule_info = {}
-    label_pattern = re.compile(r"(?:\*\*)(?P<label>[^\*].*)\:\*\*\s?(?P<value>.*)?")
-
-    collecting_params = False
-    params = []
-    for line in lines:
-        if not line:
-            continue
-        line = line.replace("\\", "").strip()
-
-        # Parameters are listed in the lines following the label, so they
-        # require special processing.
-        if collecting_params:
-            # A new header marks the end of the parameters.
-            if line.startswith("##"):
-                rule_info["Parameters"] = params
-                break
-
-            if "Type: " in line:
-                params.append(extract_param_info(line))
+    all_params = []
+    param_details = {}
+    for dt_tag in dt_tags:
+        text = dt_tag.text_content()
+        if not text or text == "None":
            continue

-        # Check for a label starting with two asterisks.
-        matches = re.match(label_pattern, line)
-        if not matches:
-            continue
-
-        # Look for "Identifier", "Trigger type", "AWS Region" and
-        # "Parameters" labels and store the values for all but parameters.
-        # Parameters values aren't on the same line as labels.
-        label = matches.group("label")
-        value = matches.group("value")
-        if label in ["Identifier", "Trigger type", "AWS Region"]:
-            rule_info[label] = value
-        elif label == "Parameters":
-            collecting_params = True
+        # If a colon is NOT present, this is the parameter name and not
+        # a key, value pair.
+        if ": " not in text:
+            # If parameter info has been collected, save it and start a
+            # collection for this new parameter.
+            if param_details:
+                all_params.append(param_details)
+                param_details = {}
+            if "Optional" in text:
+                text = text.split()[0]
+                param_details["Optional"] = True
            else:
-            print(f"ERROR:  Unknown label: '{label}', line: '{line}'", file=sys.stderr)
+                param_details["Optional"] = False
+            param_details["Name"] = text
+            continue
+
+        key, value = text.split(": ")
+        param_details[key] = value
+
+    # Collect the last parameter found.
+    if param_details:
+        all_params.append(param_details)
+
+    return all_params
+
+
+def extract_managed_rule_info(page_content):
+    """Return dict of qualifiers/rules extracted from web page.
+
+    An example of the html that's being processed:
+
+    <div id="main-content" class="awsui-util-container">
+    ...
+
+    <h1 class="topictitle" id="access-keys-rotated">access-keys-rotated</h1>
+    <p><b>Identifier:</b> ACCESS_KEYS_ROTATED</p>
+    <p><b>Resource Types:</b> AWS::IAM::User</p>
+    <p><b>Trigger type:</b> Periodic</p>
+    <p><b>AWS Region:</b> All supported AWS regions except Middle East (UAE),
+        Asia Pacific (Hyderabad), Asia Pacific (Melbourne), Israel (Tel Aviv),
+        Europe (Spain), Europe (Zurich) Region</p>
+    <p><b>Parameters:</b></p>
+    <div class="variablelist">
+    <dl>
+        <dt><span class="term">maxAccessKeyAge</span></dt>
+        <dt><span class="term">Type: int</span></dt>
+        <dt><span class="term">Default: 90</span></dt>
+          <dd>
+             <p>Maximum number of days without rotation. Default 90.</p>
+          </dd>
+      </dl>
+
+    ...
+    </div>
+    """
+    rule_info = {}
+    paragraphs = page_content.xpath('//div[@id="main-content"]/descendant::p')
+
+    for paragraph in paragraphs:
+        text = paragraph.text_content()
+        if ": " not in text:
+            continue
+
+        parts = text.split(": ")
+        if len(parts) > 2:
+            continue
+
+        if parts[0] in ["Identifier", "Trigger type", "AWS Region", "Resource Types"]:
+            rule_info[parts[0]] = parts[1]
+
+    # The parameters are in their own "div", so handle them separately.
+    rule_info["Parameters"] = extract_param_info(page_content)
    return rule_info


@ -139,35 +163,33 @@ def process_cmdline_args():
    """Return parsed command line arguments."""
    parser = argparse.ArgumentParser(
        description=(
-            f"Download AWS config rules and merge output to create the "
-            f"JSON file {MANAGED_RULES_OUTPUT_FILENAME}"
+            "Scrape web pages with AWS config rules and merge results to "
+            f"create the JSON file {MANAGED_RULES_OUTPUT_FILENAME}"
        )
    )
    parser.add_argument(
-        "-v", "--verbose", action="store_true", help="Report on progress of downloads"
+        "-v", "--verbose", action="store_true", help="Report on progress"
    )
    return parser.parse_args()


 def main():
-    """Create a JSON file containing info pulled from AWS markdown files."""
+    """Create a JSON file containing info pulled from AWS online docs."""
    args = process_cmdline_args()

-    # Get the markdown file with links to the markdown files for services.
-    req = requests.get(AWS_MARKDOWN_URL_START + LIST_OF_MARKDOWNS_URL)
+    # Get the list of links for all the managed rules.
+    page = requests.get(AWS_CONFIG_MANAGED_RULES_URL_START + LIST_OF_RULES_URL)
+    tree = html.fromstring(page.content)
+    links = [x.lstrip("./") for x in tree.xpath('//div[@class="highlights"]//ul//a/@href')]

-    # Extract the list of all the markdown files on the page.
-    link_pattern = re.compile(r"\+ \[[^\]]+\]\(([^)]+)\)")
-    markdown_files = link_pattern.findall(req.text)
-
-    # For each of those markdown files, extract the id, region, trigger type
-    # and parameter information.
+    # From each linked page, extract the id, resource types, region,
+    # trigger type and parameter information.
    managed_rules = {"ManagedRules": {}}
-    for markdown_file in markdown_files:
+    for link in links:
        if args.verbose:
-            print(f"Downloading {markdown_file} ...")
-        req = requests.get(AWS_MARKDOWN_URL_START + markdown_file)
-        rules = extract_managed_rule_info(req.text.split("\n"))
+            print(f"Extracting from {link} ...")
+        page = requests.get(AWS_CONFIG_MANAGED_RULES_URL_START + link)
+        rules = extract_managed_rule_info(html.fromstring(page.content))

        rule_id = rules.pop("Identifier")
        managed_rules["ManagedRules"][rule_id] = rules