EnterpriseDB/postgres-deployment

Use of Ansible for all tool integration


Problem

We will be integrating EDB-Terraform https://github.com/EnterpriseDB/edb-terraform back into this tool. As part of this, we could integrate these changes in various ways:

  • Python imports, with pip/setuptools
  • shell/bash commands wrapped in Python, with pip/setuptools
  • bash commands and pip installation driven by Ansible playbooks

We also implement the Azure CLI and other provider CLIs as shell commands wrapped in Python.
For example, what this snippet does is now available directly through Terraform (the azurerm provider's azurerm_marketplace_agreement resource covers accepting image terms):

def accept_terms(self, publisher, offer, sku, version):
    try:
        output = exec_shell([
            self.bin("az"),
            "vm",
            "image",
            "terms",
            "accept",
            "--urn %s:%s:%s:%s" % (publisher, offer, sku, version),
        ])
        logging.debug("Command output: %s", output.decode("utf-8"))
    except CalledProcessError as e:
        logging.error("Failed to execute the command: %s", e.cmd)
        logging.error("Return code is: %s", e.returncode)
        logging.error("Output: %s", e.output)
        raise CloudCliError(
            "Failed to execute the following command, please check the "
            "logs for details: %s" % e.cmd
        )

However, the following features are not yet available in Azure's Terraform provider; they will be stripped out of this tool and added to edb-terraform as they become available:

def check_instance_type_availability(self, instance_type, region):
    try:
        output = exec_shell([
            self.bin("az"),
            "vm",
            "list-sizes",
            "--location %s" % region,
            "--query \"[?name == '%s']\"" % instance_type,
            "--output json"
        ])
        result = json.loads(output.decode("utf-8"))
        logging.debug("Command output: %s", result)
        if len(result) == 0:
            raise CloudCliError(
                "Instance type %s not available in region %s"
                % (instance_type, region)
            )
    except ValueError:
        # JSON decoding error
        logging.error("Failed to decode JSON data")
        logging.error("Output: %s", output.decode("utf-8"))
        raise CloudCliError(
            "Failed to decode JSON data, please check the logs for details"
        )
    except CalledProcessError as e:
        logging.error("Failed to execute the command: %s", e.cmd)
        logging.error("Return code is: %s", e.returncode)
        logging.error("Output: %s", e.output)
        raise CloudCliError(
            "Failed to execute the following command, please check the "
            "logs for details: %s" % e.cmd
        )

def check_image_availability(self, publisher, offer, sku, region):
    try:
        output = exec_shell([
            self.bin("az"),
            "vm",
            "image",
            "list",
            "--all",
            "-p \"%s\"" % publisher,
            "-f \"%s\"" % offer,
            "-s \"%s\"" % sku,
            "-l %s" % region,
            "--query",
            "\"[?offer == '%s' && sku =='%s']\"" % (offer, sku),
            "--output json"
        ])
        result = json.loads(output.decode("utf-8"))
        logging.debug("Command output: %s", result)
        if len(result) == 0:
            raise CloudCliError(
                "Image %s:%s:%s not available in region %s"
                % (publisher, offer, sku, region)
            )
        return result[0]
    except ValueError:
        # JSON decoding error
        logging.error("Failed to decode JSON data")
        logging.error("Output: %s", output.decode("utf-8"))
        raise CloudCliError(
            "Failed to decode JSON data, please check the logs for details"
        )
    except CalledProcessError as e:
        logging.error("Failed to execute the command: %s", e.cmd)
        logging.error("Return code is: %s", e.returncode)
        logging.error("Output: %s", e.output)
        raise CloudCliError(
            "Failed to execute the following command, please check the "
            "logs for details: %s" % e.cmd
        )

def check_instances_availability(self, project_name):
    try:
        output = exec_shell([
            self.bin("az"),
            "vm",
            "wait",
            "--ids",
            "$(%s vm list -g \"%s_edb_resource_group\" --query \"[].id\" -o tsv)"
            % (self.bin("az"), project_name),
            "--created"
        ])
        logging.debug("Command output: %s", output.decode("utf-8"))
    except CalledProcessError as e:
        logging.error("Failed to execute the command: %s", e.cmd)
        logging.error("Return code is: %s", e.returncode)
        logging.error("Output: %s", e.output)
        raise CloudCliError(
            "Failed to execute the following command, please check the "
            "logs for details: %s" % e.cmd
        )

Within edb-terraform itself, we also need shell scripts to format additional volumes and to leave port 22 open. We could expose this more directly with the cloud-init or user-data features some providers offer, but that breaks the model where Terraform only provisions the machines and does not configure them. We also eventually need to close port 22 once it is no longer needed, which requires running Terraform commands directly.
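
For example, one way to close port 22 after configuration would be to re-apply the generated plan with a variable toggled. A minimal sketch; the open_ssh_port variable is hypothetical and assumes the generated Terraform code gates its port 22 rule on it:

import subprocess

def close_ssh_port(project_path):
    # Re-apply the generated plan with SSH closed. The "open_ssh_port"
    # variable is hypothetical: the generated Terraform code would need
    # to gate its port 22 security-group/firewall rule on it.
    subprocess.run(
        ["terraform", "apply", "-auto-approve", "-var", "open_ssh_port=false"],
        cwd=project_path,
        check=True,
    )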

We also have to pass variables around or define them manually, since each tool needs the others' input files or outputs; streamlining this would benefit our canned benchmarks as well. In this bash script, we need to know the SSH user, and we also need variables from Terraform to pass into Ansible for the playbook:
https://github.com/EnterpriseDB/edb-benchmarks/blob/c92da747fe3a1221ceb83c49bdddf9f28e2f10c9/aws-dbt2-biganimal/deploy/run.sh#L1-L17
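
A sketch of how that hand-off could be centralized, reading the Terraform outputs once and passing them straight to ansible-playbook as extra vars (the ssh_user output name is an assumption):

import json
import subprocess

def terraform_outputs(project_path):
    # "terraform output -json" prints {"name": {"value": ..., ...}, ...}
    out = subprocess.run(
        ["terraform", "output", "-json"],
        cwd=project_path,
        capture_output=True,
        check=True,
    )
    return {name: data["value"] for name, data in json.loads(out.stdout).items()}

def run_playbook(playbook, project_path):
    # Hand every Terraform output (e.g. the hypothetical ssh_user the bash
    # script needs) to ansible-playbook in one place instead of re-deriving
    # it per script.
    extra_vars = terraform_outputs(project_path)
    subprocess.run(
        ["ansible-playbook", playbook, "--extra-vars", json.dumps(extra_vars)],
        check=True,
    )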

Solution:

Handle integration of the tools by using playbooks, allowing us to separate the different tools into modules or collections, either in this tool or in their own repos. Ansible would handle state and make it much more visible where things change and get passed around, and each module/collection could be stripped or augmented as needed without impacting the rest of the code base. Ansible prefers Python or PowerShell for modules, which gives us the extensibility of a full language when needed in each module implementation. Terraform also has state that needs to be managed, and Ansible relies on sensitive credentials and inventory files built from the Terraform resources, so perhaps Ansible could handle this, with Terraform only creating secure buckets or similar.

  1. edb-terraform is turned into an Ansible module or action plugin.
  • This would allow us to use named tasks and define the variables directly in the playbook, since they are passed as YAML through to the module.
  • We could leverage the community Terraform module (community.general.terraform) to handle closing the ports within a playbook, as well as more dynamic configuration such as adding Route53 entries for AWS, which edb-terraform does not handle yet.
  • The tool would still be uploaded to PyPI, so it can be used directly from bash if Ansible is not desired.
  2. tpaexec and all cloud provider CLIs would also be implemented as modules or action plugins, or through already available Ansible collections.
  3. Create playbooks for each desired configuration, such as the EDB-RA-1 architecture, importing the edb-terraform, CLI, and tpaexec plugin modules and the edb-ansible collection; the same could be done for our canned benchmarks.
  4. postgres-deployment would integrate the playbooks into a more native CLI experience so users do not need to understand Ansible (a rough sketch follows this list).
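
As a rough sketch of item 4, the CLI could map user-facing commands onto the playbooks so Ansible stays an implementation detail (the playbook paths and option names here are hypothetical):

import argparse
import json
import subprocess

# Hypothetical mapping of user-facing architectures to playbooks.
PLAYBOOKS = {
    'EDB-RA-1': 'playbooks/edb-ra-1.yml',
    'EDB-RA-2': 'playbooks/edb-ra-2.yml',
}

def main():
    parser = argparse.ArgumentParser(prog='postgres-deployment')
    parser.add_argument('architecture', choices=sorted(PLAYBOOKS))
    parser.add_argument('--infrastructure-file', required=True)
    args = parser.parse_args()
    # Ansible stays hidden: the user picks an architecture and we run
    # the matching playbook for them.
    subprocess.run(
        ['ansible-playbook', PLAYBOOKS[args.architecture],
         '--extra-vars',
         json.dumps({'infrastructure_file': args.infrastructure_file})],
        check=True,
    )

if __name__ == '__main__':
    main()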

Here is a snippet of how edb-terraform could be made into a module within library/edb_terraform.py, with edbterraform and ansible installed in the python environment:

#!/usr/bin/python
__metaclass__ = type

from ansible.module_utils.basic import AnsibleModule
from edbterraform import lib as edbterraform
from pathlib import Path

def run_module():
    module_args = dict(
        src=dict(type='str', required=True),
        dest=dict(type='str', required=True),
        cloud_service_provider=dict(type='str', required=True),
        validate=dict(type='bool', required=False, default=False),
    )

    result = dict(
        changed=True,
        original_message='original',
        message='new',
    )

    module = AnsibleModule(
        argument_spec=module_args,
        supports_check_mode=True,
    )

    # In check mode, report the intended change without generating anything.
    if module.check_mode:
        module.exit_json(**result)

    edbterraform.generate(
        Path(module.params['src']),
        Path(module.params['dest']),
        module.params['cloud_service_provider'],
        module.params['validate'],
    )

    module.exit_json(**result)

if __name__ == '__main__':
    run_module()

Sample playbook, play.yml:

- hosts: localhost
  tasks:
    - name: Generate terraform infrastructure files
      edb_terraform:
        src: ./infrastructure-examples/compute-engine.yml
        dest: ./testing_dest
        cloud_service_provider: gcloud
        validate: true

Executing the playbook:

$ ansible-playbook play.yml

[WARNING]: No inventory was parsed, only implicit localhost is available
[WARNING]: provided hosts list is empty, only localhost is available. Note that the implicit localhost does
not match 'all'

PLAY [localhost] *******************************************************************************************

TASK [Gathering Facts] *************************************************************************************
ok: [localhost]

TASK [Generate terraform infrastructure files] *************************************************************
changed: [localhost]

PLAY RECAP *************************************************************************************************
localhost                  : ok=2    changed=1    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0   

Here is a code snippet from AWS POT. Here we use an Ansible playbook that relies on the AWS community module, the edb-ansible collection, and Terraform code. This is where edb-terraform will replace Terraform's static files, but we could leverage Ansible to do this as tasks, roles, modules, or a mix, further decoupling our tools for reuse in other workflows such as edb-benchmarks, buildbot's production playbooks, edb-ansible, pgversions, and more:

class AWSPOTProject(Project):
    ansible_collection_name = 'edb_devops.edb_postgres:>=%s,<4.0.0' % __edb_ansible_version__  # noqa
    aws_collection_name = 'community.aws:1.4.0'

    def __init__(self, name, env, bin_path=None):
        super(AWSPOTProject, self).__init__('aws-pot', name, env, bin_path)
        # Use AWS terraform code
        self.terraform_path = os.path.join(self.terraform_share_path, 'aws')
        # POT only attributes
        self.ansible_pot_role = os.path.join(self.ansible_share_path, 'roles')
        # Route53 entry removal playbook
        self.ansible_route53_remove = os.path.join(self.ansible_share_path, 'POT-Remove-Project-Route53.yml')
        # TPAexec hooks path
        self.tpaexec_pot_hooks = os.path.join(self.tpaexec_share_path, 'hooks')
        self.custom_ssh_keys = {}
        # Force PG version to 14 in POT env.
        self.postgres_version = '14'
        self.operating_system = "RockyLinux8"

Short snippet of the aws community module being used in the playbook:

# router update
- name: get the route53 information
  community.aws.route53:
    aws_access_key: "{{ route53_access_key }}"
    aws_secret_key: "{{ route53_secret }}"
    state: get
    zone: "{{ route53_zone }}"
    record: "{{ route53_record }}"
    type: A
  register: rec
  become: no
  delegate_to: localhost

- name: store the value of the existing ip address in a variable
  set_fact:
    route_ip_addressess: "{{ route_ip_addressess + [record.Value] }}"
  with_items: "{{ rec.set.ResourceRecords }}"
  when: rec.set.ResourceRecords|length > 0
  loop_control:
    loop_var: record

EnterpriseDB/edb-terraform#23 (comment)

Some Terraform modules might require multi-stage plans, applies, and destroys.

Terraform's apply command, terraform apply, might complete before all resources are ready for use. This causes Ansible to fail, since it immediately makes use of the resources, and a timeout like the ones below is required as a workaround:
https://github.com/EnterpriseDB/ebac/pull/26#issuecomment-1416246887
https://github.com/EnterpriseDB/edb-benchmarks/blob/main/aws-tpcc-tde/deploy/playbook-setup-fs.yml#L6-L9
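
A minimal sketch of the kind of readiness check that could replace a hard-coded timeout, polling until SSH answers on each provisioned host (the host list, port, and interval are assumptions):

import socket
import time

def wait_for_ssh(hosts, port=22, timeout=300):
    # Poll every provisioned host until its SSH port accepts connections,
    # instead of sleeping for a fixed interval after "terraform apply".
    deadline = time.monotonic() + timeout
    pending = set(hosts)
    while pending and time.monotonic() < deadline:
        for host in list(pending):
            try:
                with socket.create_connection((host, port), timeout=5):
                    pending.discard(host)
            except OSError:
                pass
        if pending:
            time.sleep(5)
    if pending:
        raise TimeoutError("hosts never became reachable: %s" % sorted(pending))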

  • edb-terraform will handle any provisioning of resources.
    • CLI checks and other related tools will be integrated into edb-terraform, and we can use tags, and later on user-defined templates, to easily feed resources to Ansible as is done with the servers.yml file (see the sketch after this list).
  • edb-ansible will continue to handle configuration of resources through roles, modules, and plugins as needed.
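
For illustration, a minimal sketch of that hand-off, converting a hypothetical tagged "servers" Terraform output into a servers.yml-style file for Ansible (both the output shape and the file schema here are assumptions, not the tool's actual format):

import json
import subprocess
import yaml

def write_servers_yml(project_path, dest='servers.yml'):
    out = subprocess.run(
        ['terraform', 'output', '-json'],
        cwd=project_path,
        capture_output=True,
        check=True,
    )
    outputs = json.loads(out.stdout)
    # Hypothetical shape: a tagged "servers" output keyed by machine name,
    # each entry carrying the addresses Ansible needs for its inventory.
    servers = {
        name: {'ansible_host': attrs['public_ip'],
               'private_ip': attrs['private_ip']}
        for name, attrs in outputs['servers']['value'].items()
    }
    with open(dest, 'w') as f:
        yaml.safe_dump({'servers': servers}, f)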