diff --git a/checkov/kustomize/runner.py b/checkov/kustomize/runner.py index e4cf99c742..407ab91c6d 100644 --- a/checkov/kustomize/runner.py +++ b/checkov/kustomize/runner.py @@ -13,6 +13,8 @@ import yaml from typing import Optional, Dict, Any, TextIO, TYPE_CHECKING +from checkov.common.parallelizer.parallel_runner import parallel_runner + from checkov.common.graph.graph_builder import CustomAttributes from checkov.common.graph.graph_builder.consts import GraphSource @@ -702,23 +704,17 @@ def run_kustomize_to_k8s( shared_kustomize_file_mappings = pickle_deepcopy(manager.dict()) # type:ignore[arg-type] # works with DictProxy shared_kustomize_file_mappings.clear() - jobs = [] - for filePath in self.kustomizeProcessedFolderAndMeta: - p = multiprocessing.Process( - target=self._run_kustomize_parser, - args=( - filePath, - shared_kustomize_file_mappings, - self.kustomizeProcessedFolderAndMeta, - self.templateRendererCommand, - self.target_folder_path - ) + items = [ + ( + filePath, + shared_kustomize_file_mappings, + self.kustomizeProcessedFolderAndMeta, + self.templateRendererCommand, + self.target_folder_path, ) - jobs.append(p) - p.start() - - for proc in jobs: - proc.join() + for filePath in self.kustomizeProcessedFolderAndMeta + ] + list(parallel_runner.run_function(self._run_kustomize_parser, items)) self.kustomizeFileMappings = dict(shared_kustomize_file_mappings) diff --git a/checkov/terraform/module_loading/module_finder.py b/checkov/terraform/module_loading/module_finder.py index c4d7fb9839..25ee12a81c 100644 --- a/checkov/terraform/module_loading/module_finder.py +++ b/checkov/terraform/module_loading/module_finder.py @@ -5,20 +5,16 @@ import os import re from pathlib import Path -from typing import List, Callable, TYPE_CHECKING +from typing import List, Callable, TYPE_CHECKING, Any, Optional, Dict from checkov.common.util.env_vars_config import env_vars_config from checkov.common.parallelizer.parallel_runner import parallel_runner -from 
def find_modules(path: str, loaded_files_cache: Optional[Dict[str, Any]] = None,
                 parsing_errors: Optional[Dict[str, Exception]] = None, excluded_paths: Optional[list[str]] = None) -> list[ModuleDownload]:
    """Collect the ``module`` blocks declared in the ``.tf`` files under ``path``.

    Every ``.tf`` file found while walking ``path`` is parsed with
    ``load_or_die_quietly``; each entry of its ``module`` section becomes one
    ``ModuleDownload``.

    :param path: root directory to walk
    :param loaded_files_cache: optional dict that receives the parsed content of
        every successfully loaded file, keyed by file path
    :param parsing_errors: optional dict handed to ``load_or_die_quietly``
        (presumably filled with the failures it hits - confirm against that helper)
    :param excluded_paths: optional list of regular expressions; a file whose
        path matches any of them (``re.search``) is skipped
    :return: the modules that declare a ``source``; modules without one are dropped
    """
    modules_found: list[ModuleDownload] = []
    if loaded_files_cache is None:
        loaded_files_cache = {}
    if parsing_errors is None:
        parsing_errors = {}

    # Build one alternation group per excluded pattern. Formatting the whole list
    # inside a single f-string and joining *that string* puts a "|" between its
    # characters, which produces a pattern with an empty alternative (it matches
    # every path) or, for some inputs, an invalid pattern.
    excluded_paths_regex = (
        re.compile("|".join(f"({excluded_path})" for excluded_path in excluded_paths)) if excluded_paths else None
    )
    terraform_modules_dir = os.path.join(path, ".terraform", "modules")

    for root, _, full_file_names in os.walk(path):
        if root.startswith(terraform_modules_dir):
            # don't scan the modules folder used by Terraform
            continue

        for file_name in full_file_names:
            if not file_name.endswith(".tf"):
                continue

            file_path = os.path.join(root, file_name)
            if excluded_paths_regex and excluded_paths_regex.search(file_path):
                continue

            data = load_or_die_quietly(file_path, parsing_errors)
            if not data:
                continue

            # keep the parsed content, so callers sharing the cache can reuse it
            loaded_files_cache[file_path] = data
            if "module" not in data:
                continue

            for module in data["module"]:
                for module_name, module_data in module.items():
                    md = ModuleDownload(os.path.dirname(file_path))
                    md.module_name = module_name
                    # attribute values are read as lists, only the first element is used
                    md.module_link = module_data.get("source", [None])[0]
                    md.version = module_data.get("version", [None])[0]
                    if md.module_link:
                        md.address = f"{md.module_link}:{md.version}" if md.version else md.module_link
                        modules_found.append(md)

    return modules_found
def load_or_die_quietly(
    file: str | Path | os.DirEntry[str], parsing_errors: dict[str, Exception], clean_definitions: bool = True
) -> Optional[_Hcl2Payload]:
    """
    Parse a file as JSON (``.json`` suffix) or as HCL (anything else).

    Files ending with ``.tfvars`` are never passed through ``clean_bad_definitions``,
    regardless of ``clean_definitions``.

    :param file: path of the file to load
    :param parsing_errors: receives the raised exception, keyed by the file path, when loading fails
    :param clean_definitions: whether to run ``clean_bad_definitions`` on the parsed HCL
    :return: the parsed content, or None if the file can't be loaded
    """
    file_path = os.fspath(file)
    base_name = os.path.basename(file_path)

    is_vars_file = base_name.endswith(".tfvars")
    apply_cleanup = clean_definitions and not is_vars_file

    try:
        logging.debug(f"Parsing {file_path}")

        with open(file_path, "r", encoding="utf-8-sig") as handle:
            if base_name.endswith(".json"):
                return cast("_Hcl2Payload", json.load(handle))

            parsed = __parse_with_timeout(handle)
            validated = validate_malformed_definitions(parsed)
            return clean_bad_definitions(validated) if apply_cleanup else validated
    except Exception as e:
        # best effort by design: remember the failure and let the caller carry on
        logging.debug(f"failed while parsing file {file_path}", exc_info=True)
        parsing_errors[file_path] = e
        return None


def __parse_with_timeout(f: TextIO) -> dict[str, list[dict[str, Any]]]:
    """
    Run ``hcl2.load`` on ``f``, guarded by a timeout where one can be used.

    On Windows the threading based timeout is chosen, elsewhere the signal based one,
    but only on the main thread. Off the main thread, or when
    ``HCL_PARSE_TIMEOUT_SEC`` is unset or 0, the file is parsed without any timeout.

    :raises Exception: when parsing takes longer than the configured timeout
    """
    guard_cls: Optional[Type[BaseTimeout]]
    if platform.system() == "Windows":
        guard_cls = ThreadingTimeout
    elif threading.current_thread() is threading.main_thread():
        guard_cls = SignalTimeout
    else:
        guard_cls = None

    parsing_timeout = env_vars_config.HCL_PARSE_TIMEOUT_SEC or 0
    if not (guard_cls and parsing_timeout):
        return hcl2.load(f)

    with guard_cls(parsing_timeout) as guard:
        parsed = hcl2.load(f)

    if guard.state == guard.TIMED_OUT:
        logging.debug(f"reached timeout when parsing file {f} using hcl2")
        raise Exception(f"file took more than {parsing_timeout} seconds to parse")
    return parsed
threading from collections import defaultdict -from pathlib import Path -from typing import Optional, Dict, Mapping, Set, Tuple, Callable, Any, List, cast, TYPE_CHECKING, overload, TextIO, Type - -import hcl2 +from typing import Optional, Dict, Mapping, Set, Tuple, Callable, Any, List, cast, TYPE_CHECKING, overload from checkov.common.parallelizer.parallel_runner import parallel_runner from checkov.common.runners.base_runner import filter_ignored_paths, IGNORE_HIDDEN_DIRECTORY_ENV from checkov.common.util.consts import DEFAULT_EXTERNAL_MODULES_DIR, RESOLVED_MODULE_ENTRY_NAME from checkov.common.util.data_structures_utils import pickle_deepcopy from checkov.common.util.deep_merge import pickle_deep_merge -from checkov.common.util.env_vars_config import env_vars_config -from checkov.common.util.stopit import ThreadingTimeout, SignalTimeout -from checkov.common.util.stopit.utils import BaseTimeout from checkov.common.util.type_forcers import force_list from checkov.common.variables.context import EvaluationContext -from checkov.terraform import validate_malformed_definitions, clean_bad_definitions from checkov.terraform.graph_builder.graph_components.block_types import BlockType from checkov.terraform.graph_builder.graph_components.module import Module from checkov.terraform.module_loading.content import ModuleContent -from checkov.terraform.module_loading.module_finder import load_tf_modules from checkov.terraform.module_loading.registry import module_loader_registry as default_ml_registry, \ ModuleLoaderRegistry +from checkov.terraform.module_loading.module_finder import load_tf_modules from checkov.common.util.parser_utils import is_acceptable_module_param from checkov.terraform.modules.module_utils import safe_index, \ - remove_module_dependency_from_path, \ - clean_parser_types, serialize_definitions, _Hcl2Payload + remove_module_dependency_from_path, clean_parser_types, serialize_definitions from checkov.terraform.modules.module_objects import TFModule, 
TFDefinitionKey +from checkov.terraform.parser_utils import load_or_die_quietly + if TYPE_CHECKING: from typing_extensions import TypeGuard @@ -105,7 +96,7 @@ def parse_directory( default_ml_registry.download_external_modules = download_external_modules default_ml_registry.external_modules_folder_name = external_modules_download_path default_ml_registry.module_content_cache = external_modules_content_cache if external_modules_content_cache else {} - load_tf_modules(directory) + load_tf_modules(directory, loaded_files_cache=self.loaded_files_map, parsing_errors=self.out_parsing_errors, excluded_paths=self.excluded_paths) self._parse_directory(dir_filter=lambda d: self._check_process_dir(d), vars_files=vars_files) self._update_resolved_modules() return self.out_definitions @@ -699,57 +690,3 @@ def get_tf_definition_object_from_module_dependency( return TFDefinitionKey(path.file_path, TFModule(path=module_dependency.file_path, name=module_dependency_name)) return TFDefinitionKey(path.file_path, TFModule(path=module_dependency.file_path, name=module_dependency_name, nested_tf_module=module_dependency.tf_source_modules)) - - -def load_or_die_quietly( - file: str | Path | os.DirEntry[str], parsing_errors: dict[str, Exception], clean_definitions: bool = True -) -> Optional[_Hcl2Payload]: - """ - Load JSON or HCL, depending on filename. 
- :return: None if the file can't be loaded - """ - file_path = os.fspath(file) - file_name = os.path.basename(file_path) - - if file_name.endswith('.tfvars'): - clean_definitions = False - - try: - logging.debug(f"Parsing {file_path}") - - with open(file_path, "r", encoding="utf-8-sig") as f: - if file_name.endswith(".json"): - return cast("_Hcl2Payload", json.load(f)) - else: - raw_data = __parse_with_timeout(f) - non_malformed_definitions = validate_malformed_definitions(raw_data) - if clean_definitions: - return clean_bad_definitions(non_malformed_definitions) - else: - return non_malformed_definitions - except Exception as e: - logging.debug(f'failed while parsing file {file_path}', exc_info=True) - parsing_errors[file_path] = e - return None - - -# if we are not running in a thread, run the hcl2.load function with a timeout, to prevent from getting stuck in parsing. -def __parse_with_timeout(f: TextIO) -> dict[str, list[dict[str, Any]]]: - # setting up timeout class - timeout_class: Optional[Type[BaseTimeout]] = None - if platform.system() == 'Windows': - timeout_class = ThreadingTimeout - elif threading.current_thread() is threading.main_thread(): - timeout_class = SignalTimeout - - # if we're not running on the main thread, don't use timeout - parsing_timeout = env_vars_config.HCL_PARSE_TIMEOUT_SEC or 0 - if not timeout_class or not parsing_timeout: - return hcl2.load(f) - - with timeout_class(parsing_timeout) as to_ctx_mgr: - raw_data = hcl2.load(f) - if to_ctx_mgr.state == to_ctx_mgr.TIMED_OUT: - logging.debug(f"reached timeout when parsing file {f} using hcl2") - raise Exception(f"file took more than {parsing_timeout} seconds to parse") - return raw_data diff --git a/tests/kustomize/test_runner_image_referencer.py b/tests/kustomize/test_runner_image_referencer.py index 52d32695d1..93c09a3b18 100644 --- a/tests/kustomize/test_runner_image_referencer.py +++ b/tests/kustomize/test_runner_image_referencer.py @@ -1,6 +1,7 @@ from __future__ import annotations 
def test_module_finder_nested_blocks(self):
    """A ``version`` key inside a nested block must not replace the module's own version."""
    fixture_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'nested_modules')

    found = find_modules(fixture_dir)

    self.assertEqual(1, len(found))
    # "3.14.0" is the module level version, the nested block of the fixture carries "3.15.2"
    self.assertEqual("3.14.0", found[0].version)
assert modules[1].module_link == '.terraform/modules/a.b' -