Update script that creates managed rule list used by Config (#6823)
This commit is contained in:
		
							parent
							
								
									5f1fb65038
								
							
						
					
					
						commit
						179a228741
					
				
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -1,19 +1,22 @@ | |||||||
| #!/usr/bin/env python | #!/usr/bin/env python | ||||||
| """Download markdown files with AWS managed ConfigRule info and convert to JSON. | """Scrape web-based docs for AWS managed ConfigRule info and convert to JSON. | ||||||
| 
 | 
 | ||||||
| Invocation:  ./pull_down_aws_managed_rules.py | Invocation:  ./pull_down_aws_managed_rules.py | ||||||
|  |     - Install ../requirements-tests.txt packages to ensure the lxml package | ||||||
|  |       is installed. | ||||||
|     - Execute from the moto/scripts directory. |     - Execute from the moto/scripts directory. | ||||||
|     - To track download progress, use the "-v" command line switch. |     - To track progress, use the "-v" command line switch. | ||||||
|     - MANAGED_RULES_OUTPUT_FILENAME is the variable containing the name of |     - MANAGED_RULES_OUTPUT_FILENAME is the variable with the output filename. | ||||||
|       the file that will be overwritten when this script is run. |       The file is overwritten when this script is successfully run. | ||||||
| 
 | 
 | ||||||
|     NOTE:  This script takes a while to download all the files. |     NOTE:  This script takes a while to scrape all the web pages.  The | ||||||
|  |     scraping could be parallelized, but since this script might only be | ||||||
|  |     run once every couple of months, it wasn't worth the complexity. | ||||||
| 
 | 
 | ||||||
| Summary: | Summary: | ||||||
|     The first markdown file is read to obtain the names of markdown files |     An initial web page is parsed to obtain the links for all the other | ||||||
|     for all the AWS managed config rules.  Then each of those markdown files |     docs for AWS managed config rules.  Each of those links is parsed | ||||||
|     are read and info is extracted with the final results written to a JSON |     and the needed info is written to a JSON file. | ||||||
|     file. |  | ||||||
| 
 | 
 | ||||||
|     The JSON output will look as follows: |     The JSON output will look as follows: | ||||||
| 
 | 
 | ||||||
| @ -31,6 +34,7 @@ Summary: | |||||||
|                             } |                             } | ||||||
|                     ], |                     ], | ||||||
|                     "Trigger type": "Periodic" |                     "Trigger type": "Periodic" | ||||||
|  |                     "Resource Types": "AWS::IAM::User" | ||||||
|                 }, |                 }, | ||||||
|             }, |             }, | ||||||
|             ... |             ... | ||||||
| @ -40,98 +44,118 @@ Summary: | |||||||
| import argparse | import argparse | ||||||
| 
 | 
 | ||||||
| import json | import json | ||||||
| import re |  | ||||||
| import sys | import sys | ||||||
| 
 | 
 | ||||||
|  | from lxml import html | ||||||
| import requests | import requests | ||||||
| 
 | 
 | ||||||
| MANAGED_RULES_OUTPUT_FILENAME = "../moto/config/resources/aws_managed_rules.json" | MANAGED_RULES_OUTPUT_FILENAME = "../moto/config/resources/aws_managed_rules.json" | ||||||
| 
 | 
 | ||||||
| AWS_MARKDOWN_URL_START = "https://raw.githubusercontent.com/awsdocs/aws-config-developer-guide/main/doc_source/" | AWS_CONFIG_MANAGED_RULES_URL_START = ( | ||||||
|  |     "https://docs.aws.amazon.com/config/latest/developerguide/" | ||||||
|  | ) | ||||||
| 
 | 
 | ||||||
| LIST_OF_MARKDOWNS_URL = "managed-rules-by-aws-config.md" | LIST_OF_RULES_URL = "managed-rules-by-aws-config.html" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def extract_param_info(line): | def extract_param_info(page_content): | ||||||
|     """Return dict containing parameter info extracted from line.""" |     """Return dict containing parameter info extracted from page. | ||||||
|     # Examples of parameter definitions: |  | ||||||
|     #   maxAccessKeyAgeType: intDefault: 90 |  | ||||||
|     #   IgnorePublicAcls \(Optional\)Type: StringDefault: True |  | ||||||
|     #   MasterAccountId \(Optional\)Type: String |  | ||||||
|     #   endpointConfigurationTypesType: String |  | ||||||
| 
 | 
 | ||||||
|     values = re.split(r":\s?", line) |     The info for all (not each) parameters is contained within a "dl" tag, | ||||||
|     name = values[0] |     with "dt" tags providing the details.  A "dt" tag without a colon | ||||||
|     param_type = values[1] |     provides the parameter name and indicates that the "dt" tags that follow | ||||||
|  |     provide details for that parameter up until the next "dt" tag without a | ||||||
|  |     colon or the end of the "dl" tag. | ||||||
|  |     """ | ||||||
|  |     dl_tags = page_content.xpath('//div[@class="variablelist"]//dl') | ||||||
|  |     if len(dl_tags) > 1: | ||||||
|  |         print( | ||||||
|  |             f"ERROR: Found {len(dl_tags)} 'dl' tags for parameters; " | ||||||
|  |             "only expecting one.  Ignoring extra 'dl' tag.", | ||||||
|  |             file=sys.stderr | ||||||
|  |         ) | ||||||
| 
 | 
 | ||||||
|     # If there is no Optional keyword, then sometimes there |     dt_tags = dl_tags[0].xpath(".//dt") | ||||||
|     # isn't a space between the parameter name and "Type". |  | ||||||
|     name = re.sub("Type$", "", name) |  | ||||||
| 
 | 
 | ||||||
|     # Sometimes there isn't a space between the type and the |     all_params = [] | ||||||
|     # word "Default". |     param_details = {} | ||||||
|     if "Default" in param_type: |     for dt_tag in dt_tags: | ||||||
|         param_type = re.sub("Default$", "", param_type) |         text = dt_tag.text_content() | ||||||
| 
 |         if not text or text == "None": | ||||||
|     optional = False |  | ||||||
|     if "Optional" in line: |  | ||||||
|         optional = True |  | ||||||
|         # Remove "Optional" from the line. |  | ||||||
|         name = name.split()[0] |  | ||||||
| 
 |  | ||||||
|     param_info = { |  | ||||||
|         "Name": name, |  | ||||||
|         "Optional": optional, |  | ||||||
|         "Type": param_type, |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     # A default value isn't always provided. |  | ||||||
|     if len(values) > 2: |  | ||||||
|         param_info["Default"] = values[2] |  | ||||||
| 
 |  | ||||||
|     return param_info |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def extract_managed_rule_info(lines): |  | ||||||
|     """Return dict of qualifiers/rules extracted from a markdown file.""" |  | ||||||
|     rule_info = {} |  | ||||||
|     label_pattern = re.compile(r"(?:\*\*)(?P<label>[^\*].*)\:\*\*\s?(?P<value>.*)?") |  | ||||||
| 
 |  | ||||||
|     collecting_params = False |  | ||||||
|     params = [] |  | ||||||
|     for line in lines: |  | ||||||
|         if not line: |  | ||||||
|             continue |  | ||||||
|         line = line.replace("\\", "").strip() |  | ||||||
| 
 |  | ||||||
|         # Parameters are listed in the lines following the label, so they |  | ||||||
|         # require special processing. |  | ||||||
|         if collecting_params: |  | ||||||
|             # A new header marks the end of the parameters. |  | ||||||
|             if line.startswith("##"): |  | ||||||
|                 rule_info["Parameters"] = params |  | ||||||
|                 break |  | ||||||
| 
 |  | ||||||
|             if "Type: " in line: |  | ||||||
|                 params.append(extract_param_info(line)) |  | ||||||
|             continue |             continue | ||||||
| 
 | 
 | ||||||
|         # Check for a label starting with two asterisks. |         # If a colon is NOT present, this is the parameter name and not | ||||||
|         matches = re.match(label_pattern, line) |         # a key, value pair. | ||||||
|         if not matches: |         if ": " not in text: | ||||||
|             continue |             # If parameter info has been collected, save it and start a | ||||||
| 
 |             # collection for this new parameter. | ||||||
|         # Look for "Identifier", "Trigger type", "AWS Region" and |             if param_details: | ||||||
|         # "Parameters" labels and store the values for all but parameters. |                 all_params.append(param_details) | ||||||
|         # Parameters values aren't on the same line as labels. |                 param_details = {} | ||||||
|         label = matches.group("label") |             if "Optional" in text: | ||||||
|         value = matches.group("value") |                 text = text.split()[0] | ||||||
|         if label in ["Identifier", "Trigger type", "AWS Region"]: |                 param_details["Optional"] = True | ||||||
|             rule_info[label] = value |  | ||||||
|         elif label == "Parameters": |  | ||||||
|             collecting_params = True |  | ||||||
|             else: |             else: | ||||||
|             print(f"ERROR:  Unknown label: '{label}', line: '{line}'", file=sys.stderr) |                 param_details["Optional"] = False | ||||||
|  |             param_details["Name"] = text | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         key, value = text.split(": ") | ||||||
|  |         param_details[key] = value | ||||||
|  | 
 | ||||||
|  |     # Collect the last parameter found. | ||||||
|  |     if param_details: | ||||||
|  |         all_params.append(param_details) | ||||||
|  | 
 | ||||||
|  |     return all_params | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def extract_managed_rule_info(page_content): | ||||||
|  |     """Return dict of qualifiers/rules extracted from web page. | ||||||
|  | 
 | ||||||
|  |     An example of the html that's being processed: | ||||||
|  | 
 | ||||||
|  |     <div id="main-content" class="awsui-util-container"> | ||||||
|  |     ... | ||||||
|  | 
 | ||||||
|  |     <h1 class="topictitle" id="access-keys-rotated">access-keys-rotated</h1> | ||||||
|  |     <p><b>Identifier:</b> ACCESS_KEYS_ROTATED</p> | ||||||
|  |     <p><b>Resource Types:</b> AWS::IAM::User</p> | ||||||
|  |     <p><b>Trigger type:</b> Periodic</p> | ||||||
|  |     <p><b>AWS Region:</b> All supported AWS regions except Middle East (UAE), | ||||||
|  |         Asia Pacific (Hyderabad), Asia Pacific (Melbourne), Israel (Tel Aviv), | ||||||
|  |         Europe (Spain), Europe (Zurich) Region</p> | ||||||
|  |     <p><b>Parameters:</b></p> | ||||||
|  |     <div class="variablelist"> | ||||||
|  |     <dl> | ||||||
|  |         <dt><span class="term">maxAccessKeyAge</span></dt> | ||||||
|  |         <dt><span class="term">Type: int</span></dt> | ||||||
|  |         <dt><span class="term">Default: 90</span></dt> | ||||||
|  |           <dd> | ||||||
|  |              <p>Maximum number of days without rotation. Default 90.</p> | ||||||
|  |           </dd> | ||||||
|  |       </dl> | ||||||
|  | 
 | ||||||
|  |     ... | ||||||
|  |     </div> | ||||||
|  |     """ | ||||||
|  |     rule_info = {} | ||||||
|  |     paragraphs = page_content.xpath('//div[@id="main-content"]/descendant::p') | ||||||
|  | 
 | ||||||
|  |     for paragraph in paragraphs: | ||||||
|  |         text = paragraph.text_content() | ||||||
|  |         if ": " not in text: | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         parts = text.split(": ") | ||||||
|  |         if len(parts) > 2: | ||||||
|  |             continue | ||||||
|  | 
 | ||||||
|  |         if parts[0] in ["Identifier", "Trigger type", "AWS Region", "Resource Types"]: | ||||||
|  |             rule_info[parts[0]] = parts[1] | ||||||
|  | 
 | ||||||
|  |     # The parameters are in their own "div", so handle them separately. | ||||||
|  |     rule_info["Parameters"] = extract_param_info(page_content) | ||||||
|     return rule_info |     return rule_info | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -139,35 +163,33 @@ def process_cmdline_args(): | |||||||
|     """Return parsed command line arguments.""" |     """Return parsed command line arguments.""" | ||||||
|     parser = argparse.ArgumentParser( |     parser = argparse.ArgumentParser( | ||||||
|         description=( |         description=( | ||||||
|             f"Download AWS config rules and merge output to create the " |             "Scrape web pages with AWS config rules and merge results to " | ||||||
|             f"JSON file {MANAGED_RULES_OUTPUT_FILENAME}" |             f"create the JSON file {MANAGED_RULES_OUTPUT_FILENAME}" | ||||||
|         ) |         ) | ||||||
|     ) |     ) | ||||||
|     parser.add_argument( |     parser.add_argument( | ||||||
|         "-v", "--verbose", action="store_true", help="Report on progress of downloads" |         "-v", "--verbose", action="store_true", help="Report on progress" | ||||||
|     ) |     ) | ||||||
|     return parser.parse_args() |     return parser.parse_args() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def main(): | def main(): | ||||||
|     """Create a JSON file containing info pulled from AWS markdown files.""" |     """Create a JSON file containing info pulled from AWS online docs.""" | ||||||
|     args = process_cmdline_args() |     args = process_cmdline_args() | ||||||
| 
 | 
 | ||||||
|     # Get the markdown file with links to the markdown files for services. |     # Get the list of links for all the services. | ||||||
|     req = requests.get(AWS_MARKDOWN_URL_START + LIST_OF_MARKDOWNS_URL) |     page = requests.get(AWS_CONFIG_MANAGED_RULES_URL_START + LIST_OF_RULES_URL) | ||||||
|  |     tree = html.fromstring(page.content) | ||||||
|  |     links = [x.lstrip("./") for x in tree.xpath('//div[@class="highlights"]//ul//a/@href')] | ||||||
| 
 | 
 | ||||||
|     # Extract the list of all the markdown files on the page. |     # From each linked page, extract the id, region, trigger type and parameter | ||||||
|     link_pattern = re.compile(r"\+ \[[^\]]+\]\(([^)]+)\)") |     # information. | ||||||
|     markdown_files = link_pattern.findall(req.text) |  | ||||||
| 
 |  | ||||||
|     # For each of those markdown files, extract the id, region, trigger type |  | ||||||
|     # and parameter information. |  | ||||||
|     managed_rules = {"ManagedRules": {}} |     managed_rules = {"ManagedRules": {}} | ||||||
|     for markdown_file in markdown_files: |     for link in links: | ||||||
|         if args.verbose: |         if args.verbose: | ||||||
|             print(f"Downloading {markdown_file} ...") |             print(f"Extracting from {link} ...") | ||||||
|         req = requests.get(AWS_MARKDOWN_URL_START + markdown_file) |         page = requests.get(AWS_CONFIG_MANAGED_RULES_URL_START + link) | ||||||
|         rules = extract_managed_rule_info(req.text.split("\n")) |         rules = extract_managed_rule_info(html.fromstring(page.content)) | ||||||
| 
 | 
 | ||||||
|         rule_id = rules.pop("Identifier") |         rule_id = rules.pop("Identifier") | ||||||
|         managed_rules["ManagedRules"][rule_id] = rules |         managed_rules["ManagedRules"][rule_id] = rules | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user