mirror of
https://github.com/jlengrand/engine.git
synced 2026-03-10 08:11:21 +00:00
feat: support scw node change without disruption (#587)
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,5 +1,6 @@
|
||||
**/target
|
||||
*.iml
|
||||
*.orig
|
||||
.idea
|
||||
.qovery-workspace
|
||||
.terraform/
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
{% for scw_ks_worker_node in scw_ks_worker_nodes %}
|
||||
resource "scaleway_k8s_pool" "kubernetes_cluster_workers_{{ loop.index }}" {
|
||||
cluster_id = scaleway_k8s_cluster.kubernetes_cluster.id
|
||||
name = "${var.kubernetes_cluster_id}_{{ loop.index }}"
|
||||
name = "${var.kubernetes_cluster_id}_{{ scw_ks_worker_node.instance_type }}_{{ loop.index }}"
|
||||
node_type = "{{ scw_ks_worker_node.instance_type }}"
|
||||
|
||||
region = var.region
|
||||
@@ -13,11 +13,15 @@ resource "scaleway_k8s_pool" "kubernetes_cluster_workers_{{ loop.index }}" {
|
||||
size = "{{ scw_ks_worker_node.min_nodes }}"
|
||||
min_size = "{{ scw_ks_worker_node.min_nodes }}"
|
||||
max_size = "{{ scw_ks_worker_node.max_nodes }}"
|
||||
wait_for_pool_ready = true
|
||||
|
||||
depends_on = [
|
||||
scaleway_k8s_cluster.kubernetes_cluster,
|
||||
]
|
||||
|
||||
tags = concat(local.tags_ks_list, ["QoveryNodeGroupName:{{ scw_ks_worker_node.name }}", "QoveryNodeGroupId:${var.kubernetes_cluster_id}-{{ loop.index }}"])
|
||||
lifecycle {
|
||||
create_before_destroy = true
|
||||
}
|
||||
tags = concat(local.tags_ks_list, ["QoveryNodeGroupName:{{ scw_ks_worker_node.name }}", "QoveryNodeGroupId:${var.kubernetes_cluster_id}_{{ scw_ks_worker_node.instance_type }}_{{ loop.index }}"])
|
||||
}
|
||||
{% endfor %}
|
||||
@@ -77,7 +77,7 @@ variable "kubernetes_cluster_id" {
|
||||
|
||||
variable "kubernetes_cluster_name" {
|
||||
description = "Kubernetes cluster name"
|
||||
default = "qovery-{{ kubernetes_cluster_id }}" # TODO(benjaminch): handle name creation in code
|
||||
default = "{{ kubernetes_cluster_name }}"
|
||||
type = string
|
||||
}
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ terraform {
|
||||
required_providers {
|
||||
scaleway = {
|
||||
source = "scaleway/scaleway"
|
||||
version = "~> 2.1.0"
|
||||
version = "~> 2.2.0"
|
||||
}
|
||||
aws = {
|
||||
source = "hashicorp/aws"
|
||||
@@ -28,7 +28,7 @@ terraform {
|
||||
version = "~> 2.24.1"
|
||||
}
|
||||
}
|
||||
required_version = ">= 0.13"
|
||||
required_version = ">= 0.14"
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -947,7 +947,10 @@ impl<'a> EKS<'a> {
|
||||
};
|
||||
|
||||
if tf_workers_resources.is_empty() {
|
||||
return Err(EngineError::new_cluster_has_no_worker_nodes(event_details.clone()));
|
||||
return Err(EngineError::new_cluster_has_no_worker_nodes(
|
||||
event_details.clone(),
|
||||
None,
|
||||
));
|
||||
}
|
||||
|
||||
let kubernetes_config_file_path = self.get_kubeconfig_file_path()?;
|
||||
|
||||
@@ -87,6 +87,7 @@ mod tests {
|
||||
NodeGroups::new("".to_string(), 2, 2, "t2.large".to_string(), 20).unwrap(),
|
||||
NodeGroups {
|
||||
name: "".to_string(),
|
||||
id: None,
|
||||
min_nodes: 2,
|
||||
max_nodes: 2,
|
||||
instance_type: "t2.large".to_string(),
|
||||
|
||||
@@ -133,6 +133,7 @@ mod tests {
|
||||
NodeGroups::new("".to_string(), 2, 2, "s-2vcpu-4gb".to_string(), 20).unwrap(),
|
||||
NodeGroups {
|
||||
name: "".to_string(),
|
||||
id: None,
|
||||
min_nodes: 2,
|
||||
max_nodes: 2,
|
||||
instance_type: "s-2vcpu-4gb".to_string(),
|
||||
|
||||
@@ -34,7 +34,7 @@ use crate::fs::workspace_directory;
|
||||
use crate::logger::{LogLevel, Logger};
|
||||
use crate::models::ProgressLevel::Info;
|
||||
use crate::models::{
|
||||
Action, Context, Listen, ListenersHelper, ProgressInfo, ProgressLevel, ProgressScope, QoveryIdentifier,
|
||||
Action, Context, Listen, ListenersHelper, ProgressInfo, ProgressLevel, ProgressScope, QoveryIdentifier, StringPath,
|
||||
};
|
||||
use crate::object_storage::ObjectStorage;
|
||||
use crate::unit_conversion::{any_to_mi, cpu_string_to_float};
|
||||
@@ -78,12 +78,46 @@ pub trait Kubernetes: Listen {
|
||||
)
|
||||
}
|
||||
|
||||
fn get_kubeconfig_filename(&self) -> String {
|
||||
format!("{}.yaml", self.id())
|
||||
}
|
||||
|
||||
fn get_kubeconfig_file(&self) -> Result<(String, File), EngineError> {
|
||||
let event_details = self.get_event_details(Infrastructure(InfrastructureStep::LoadConfiguration));
|
||||
let bucket_name = format!("qovery-kubeconfigs-{}", self.id());
|
||||
let object_key = format!("{}.yaml", self.id());
|
||||
let object_key = self.get_kubeconfig_filename();
|
||||
let stage = Stage::General(GeneralStep::RetrieveClusterConfig);
|
||||
|
||||
let (string_path, file) = match self
|
||||
// check if kubeconfig locally exists
|
||||
let local_kubeconfig = match self.get_temp_dir(event_details) {
|
||||
Ok(x) => {
|
||||
let local_kubeconfig_folder_path = format!("{}/{}", &x, &bucket_name);
|
||||
let local_kubeconfig_generated = format!("{}/{}", &local_kubeconfig_folder_path, &object_key);
|
||||
if Path::new(&local_kubeconfig_generated).exists() {
|
||||
match File::open(&local_kubeconfig_generated) {
|
||||
Ok(_) => Some(local_kubeconfig_generated),
|
||||
Err(_) => {
|
||||
debug!("couldn't open {} file", &local_kubeconfig_generated);
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
Err(_) => None,
|
||||
};
|
||||
|
||||
// otherwise, try to get it from object storage
|
||||
let (string_path, file) = match local_kubeconfig {
|
||||
Some(local_kubeconfig_generated) => {
|
||||
let kubeconfig_file =
|
||||
File::open(&local_kubeconfig_generated).expect("couldn't read kubeconfig file, but file exists");
|
||||
|
||||
(StringPath::from(&local_kubeconfig_generated), kubeconfig_file)
|
||||
}
|
||||
None => {
|
||||
match self
|
||||
.config_file_store()
|
||||
.get(bucket_name.as_str(), object_key.as_str(), true)
|
||||
{
|
||||
@@ -102,6 +136,8 @@ pub trait Kubernetes: Listen {
|
||||
self.logger().log(LogLevel::Error, EngineEvent::Error(error.clone()));
|
||||
return Err(error);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let metadata = match file.metadata() {
|
||||
@@ -233,11 +269,12 @@ pub trait Kubernetes: Listen {
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
let kubeconfig = match self.get_kubeconfig_file() {
|
||||
Ok((path, _)) => path,
|
||||
Err(e) => return Err(CommandError::new(e.message(), None)),
|
||||
};
|
||||
send_progress_on_long_task(self, Action::Create, || {
|
||||
check_workers_status(
|
||||
self.get_kubeconfig_file_path().expect("Unable to get Kubeconfig"),
|
||||
self.cloud_provider().credentials_environment_variables(),
|
||||
)
|
||||
check_workers_status(&kubeconfig, self.cloud_provider().credentials_environment_variables())
|
||||
})
|
||||
}
|
||||
fn upgrade_with_status(&self, kubernetes_upgrade_status: KubernetesUpgradeStatus) -> Result<(), EngineError>;
|
||||
@@ -436,11 +473,29 @@ pub fn deploy_environment(
|
||||
"deployment",
|
||||
CheckAction::Deploy,
|
||||
)?;
|
||||
}
|
||||
|
||||
// Quick fix: adding 100 ms delay to avoid race condition on service status update
|
||||
thread::sleep(std::time::Duration::from_millis(100));
|
||||
|
||||
// check all deployed services
|
||||
for service in &environment.stateful_services {
|
||||
let _ = service::check_kubernetes_service_error(
|
||||
service.exec_check_action(),
|
||||
kubernetes,
|
||||
service,
|
||||
event_details.clone(),
|
||||
&stateless_deployment_target,
|
||||
&listeners_helper,
|
||||
"check deployment",
|
||||
CheckAction::Deploy,
|
||||
)?;
|
||||
}
|
||||
|
||||
// Quick fix: adding 100 ms delay to avoid race condition on service status update
|
||||
thread::sleep(std::time::Duration::from_millis(100));
|
||||
|
||||
for service in &environment.stateless_services {
|
||||
let _ = service::check_kubernetes_service_error(
|
||||
service.exec_check_action(),
|
||||
kubernetes,
|
||||
@@ -1192,6 +1247,7 @@ impl NodeGroups {
|
||||
|
||||
Ok(NodeGroups {
|
||||
name: group_name,
|
||||
id: None,
|
||||
min_nodes,
|
||||
max_nodes,
|
||||
instance_type,
|
||||
|
||||
@@ -65,6 +65,7 @@ pub struct CpuLimits {
|
||||
#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)]
|
||||
pub struct NodeGroups {
|
||||
pub name: String,
|
||||
pub id: Option<String>,
|
||||
pub min_nodes: i32,
|
||||
pub max_nodes: i32,
|
||||
pub instance_type: String,
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::cloud_provider::models::{NodeGroups, NodeGroupsFormat};
|
||||
use crate::cloud_provider::qovery::EngineLocation;
|
||||
use crate::cloud_provider::scaleway::application::ScwZone;
|
||||
use crate::cloud_provider::scaleway::kubernetes::helm_charts::{scw_helm_charts, ChartsConfigPrerequisites};
|
||||
use crate::cloud_provider::scaleway::kubernetes::node::ScwInstancesType;
|
||||
use crate::cloud_provider::scaleway::kubernetes::node::{ScwInstancesType, ScwNodeGroup};
|
||||
use crate::cloud_provider::utilities::print_action;
|
||||
use crate::cloud_provider::{kubernetes, CloudProvider};
|
||||
use crate::cmd::kubectl::{kubectl_exec_api_custom_metrics, kubectl_exec_get_all_namespaces, kubectl_exec_get_events};
|
||||
@@ -29,18 +29,32 @@ use crate::models::{
|
||||
};
|
||||
use crate::object_storage::scaleway_object_storage::{BucketDeleteStrategy, ScalewayOS};
|
||||
use crate::object_storage::ObjectStorage;
|
||||
use crate::runtime::block_on;
|
||||
use crate::string::terraform_list_format;
|
||||
use crate::{cmd, dns_provider};
|
||||
use ::function_name::named;
|
||||
use reqwest::StatusCode;
|
||||
use retry::delay::{Fibonacci, Fixed};
|
||||
use retry::Error::Operation;
|
||||
use retry::OperationResult;
|
||||
use scaleway_api_rs::apis::Error;
|
||||
use scaleway_api_rs::models::ScalewayK8sV1Cluster;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::env;
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
use tera::Context as TeraContext;
|
||||
|
||||
#[derive(PartialEq)]
|
||||
pub enum ScwNodeGroupErrors {
|
||||
CloudProviderApiError(CommandError),
|
||||
ClusterDoesNotExists(CommandError),
|
||||
MultipleClusterFound,
|
||||
NoNodePoolFound(CommandError),
|
||||
MissingNodePoolInfo,
|
||||
NodeGroupValidationError(CommandError),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||
pub struct KapsuleOptions {
|
||||
// Qovery
|
||||
@@ -193,6 +207,210 @@ impl<'a> Kapsule<'a> {
|
||||
})
|
||||
}
|
||||
|
||||
fn get_configuration(&self) -> scaleway_api_rs::apis::configuration::Configuration {
|
||||
scaleway_api_rs::apis::configuration::Configuration {
|
||||
api_key: Some(scaleway_api_rs::apis::configuration::ApiKey {
|
||||
key: self.options.scaleway_secret_key.clone(),
|
||||
prefix: None,
|
||||
}),
|
||||
..scaleway_api_rs::apis::configuration::Configuration::default()
|
||||
}
|
||||
}
|
||||
|
||||
fn get_scw_cluster_info(&self) -> Result<Option<ScalewayK8sV1Cluster>, EngineError> {
|
||||
let event_details = self.get_event_details(Infrastructure(InfrastructureStep::LoadConfiguration));
|
||||
|
||||
// get cluster info
|
||||
let cluster_info = match block_on(scaleway_api_rs::apis::clusters_api::list_clusters(
|
||||
&self.get_configuration(),
|
||||
self.region().as_str(),
|
||||
None,
|
||||
Some(self.options.scaleway_project_id.as_str()),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
Some(self.cluster_name().as_str()),
|
||||
None,
|
||||
None,
|
||||
)) {
|
||||
Ok(x) => x,
|
||||
Err(e) => {
|
||||
let msg = format!("wasn't able to retrieve SCW cluster information from the API. {:?}", e);
|
||||
return Err(EngineError::new_cannot_get_cluster_error(
|
||||
event_details.clone(),
|
||||
CommandError::new(msg.clone(), Some(msg)),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
// if no cluster exists
|
||||
let cluster_info_content = cluster_info.clusters.unwrap();
|
||||
if &cluster_info_content.len() == &(0 as usize) {
|
||||
return Ok(None);
|
||||
} else if &cluster_info_content.len() != &(1 as usize) {
|
||||
let msg = format!(
|
||||
"too many clusters found with this name, where 1 was expected. {:?}",
|
||||
&cluster_info_content.len()
|
||||
);
|
||||
return Err(EngineError::new_multiple_cluster_found_expected_one_error(
|
||||
event_details,
|
||||
CommandError::new(msg.clone(), Some(msg)),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(Some(cluster_info_content[0].clone()))
|
||||
}
|
||||
|
||||
fn get_existing_sanitized_node_groups(
|
||||
&self,
|
||||
cluster_info: ScalewayK8sV1Cluster,
|
||||
) -> Result<Vec<ScwNodeGroup>, ScwNodeGroupErrors> {
|
||||
let error_cluster_id = format!("expected cluster id for this Scaleway cluster");
|
||||
let cluster_id = match cluster_info.id {
|
||||
None => {
|
||||
return Err(ScwNodeGroupErrors::NodeGroupValidationError(
|
||||
CommandError::new_from_safe_message(error_cluster_id),
|
||||
))
|
||||
}
|
||||
Some(x) => x,
|
||||
};
|
||||
|
||||
let pools = match block_on(scaleway_api_rs::apis::pools_api::list_pools(
|
||||
&self.get_configuration(),
|
||||
self.region().as_str(),
|
||||
cluster_id.as_str(),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)) {
|
||||
Ok(x) => x,
|
||||
Err(e) => {
|
||||
let msg = format!("error while trying to get SCW pool info from cluster {}", &cluster_id);
|
||||
let msg_with_error = format!("{}. {:?}", msg.clone(), e);
|
||||
return Err(ScwNodeGroupErrors::CloudProviderApiError(CommandError::new(
|
||||
msg_with_error,
|
||||
Some(msg),
|
||||
)));
|
||||
}
|
||||
};
|
||||
|
||||
// ensure pool are present
|
||||
if pools.pools.is_none() {
|
||||
let msg = format!(
|
||||
"No SCW pool found from the SCW API for cluster {}/{}",
|
||||
&cluster_id,
|
||||
&cluster_info.name.unwrap_or("unknown cluster".to_string())
|
||||
);
|
||||
return Err(ScwNodeGroupErrors::NoNodePoolFound(CommandError::new(
|
||||
msg.clone(),
|
||||
Some(msg),
|
||||
)));
|
||||
}
|
||||
|
||||
// create sanitized nodegroup pools
|
||||
let mut nodegroup_pool: Vec<ScwNodeGroup> = Vec::with_capacity(pools.total_count.unwrap_or(0 as f32) as usize);
|
||||
for ng in pools.pools.unwrap() {
|
||||
if ng.id.is_none() {
|
||||
let msg = format!(
|
||||
"error while trying to validate SCW pool ID from cluster {}",
|
||||
&cluster_id
|
||||
);
|
||||
return Err(ScwNodeGroupErrors::NodeGroupValidationError(CommandError::new(
|
||||
msg.clone(),
|
||||
Some(msg),
|
||||
)));
|
||||
}
|
||||
let ng_sanitized = self.get_node_group_info(ng.id.unwrap().as_str())?;
|
||||
nodegroup_pool.push(ng_sanitized)
|
||||
}
|
||||
|
||||
Ok(nodegroup_pool)
|
||||
}
|
||||
|
||||
fn get_node_group_info(&self, pool_id: &str) -> Result<ScwNodeGroup, ScwNodeGroupErrors> {
|
||||
let pool = match block_on(scaleway_api_rs::apis::pools_api::get_pool(
|
||||
&self.get_configuration(),
|
||||
self.region().as_str(),
|
||||
pool_id,
|
||||
)) {
|
||||
Ok(x) => x,
|
||||
Err(e) => {
|
||||
return Err(match e {
|
||||
Error::ResponseError(x) => {
|
||||
let msg_with_error = format!(
|
||||
"Error code while getting node group: {}, API message: {} ",
|
||||
x.status, x.content
|
||||
);
|
||||
match x.status {
|
||||
StatusCode::NOT_FOUND => ScwNodeGroupErrors::NoNodePoolFound(CommandError::new(
|
||||
msg_with_error,
|
||||
Some("No node pool found".to_string()),
|
||||
)),
|
||||
_ => ScwNodeGroupErrors::CloudProviderApiError(CommandError::new(
|
||||
msg_with_error,
|
||||
Some("Scaleway API error while trying to get node group".to_string()),
|
||||
)),
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let msg = "This Scaleway API error is not supported in the engine, please add it to better support it".to_string();
|
||||
ScwNodeGroupErrors::NodeGroupValidationError(CommandError::new(msg.clone(), Some(msg)))
|
||||
}
|
||||
})
|
||||
}
|
||||
};
|
||||
|
||||
// ensure there is no missing info
|
||||
if let Err(e) = self.check_missing_nodegroup_info(&pool.name, "name") {
|
||||
return Err(e);
|
||||
};
|
||||
if let Err(e) = self.check_missing_nodegroup_info(&pool.min_size, "min_size") {
|
||||
return Err(e);
|
||||
};
|
||||
if let Err(e) = self.check_missing_nodegroup_info(&pool.max_size, "max_size") {
|
||||
return Err(e);
|
||||
};
|
||||
if let Err(e) = self.check_missing_nodegroup_info(&pool.status, "status") {
|
||||
return Err(e);
|
||||
};
|
||||
|
||||
match ScwNodeGroup::new(
|
||||
pool.id,
|
||||
pool.name.unwrap(),
|
||||
pool.min_size.unwrap() as i32,
|
||||
pool.max_size.unwrap() as i32,
|
||||
pool.node_type,
|
||||
pool.size as i32,
|
||||
pool.status.unwrap(),
|
||||
) {
|
||||
Ok(x) => Ok(x),
|
||||
Err(e) => Err(ScwNodeGroupErrors::NodeGroupValidationError(e)),
|
||||
}
|
||||
}
|
||||
|
||||
fn check_missing_nodegroup_info<T>(&self, item: &Option<T>, name: &str) -> Result<(), ScwNodeGroupErrors> {
|
||||
let event_details = self.get_event_details(Infrastructure(InfrastructureStep::LoadConfiguration));
|
||||
|
||||
self.logger.log(
|
||||
LogLevel::Error,
|
||||
EngineEvent::Error(EngineError::new_missing_workers_group_info_error(
|
||||
event_details,
|
||||
CommandError::new_from_safe_message(format!(
|
||||
"Missing node pool info {} for cluster {}",
|
||||
name,
|
||||
self.context.cluster_id()
|
||||
)),
|
||||
)),
|
||||
);
|
||||
|
||||
if item.is_none() {
|
||||
return Err(ScwNodeGroupErrors::MissingNodePoolInfo);
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn kubeconfig_bucket_name(&self) -> String {
|
||||
format!("qovery-kubeconfigs-{}", self.id())
|
||||
}
|
||||
@@ -256,7 +474,7 @@ impl<'a> Kapsule<'a> {
|
||||
// Kubernetes
|
||||
context.insert("test_cluster", &self.context.is_test_cluster());
|
||||
context.insert("kubernetes_cluster_id", self.id());
|
||||
context.insert("kubernetes_cluster_name", self.name());
|
||||
context.insert("kubernetes_cluster_name", self.cluster_name().as_str());
|
||||
context.insert("kubernetes_cluster_version", self.version());
|
||||
|
||||
// Qovery
|
||||
@@ -579,6 +797,192 @@ impl<'a> Kapsule<'a> {
|
||||
return Err(error);
|
||||
}
|
||||
|
||||
let cluster_info = self.get_scw_cluster_info()?;
|
||||
if cluster_info.is_none() {
|
||||
let msg = "no cluster found from the Scaleway API".to_string();
|
||||
return Err(EngineError::new_no_cluster_found_error(
|
||||
event_details.clone(),
|
||||
CommandError::new(msg.clone(), Some(msg)),
|
||||
));
|
||||
}
|
||||
|
||||
let current_nodegroups = match self
|
||||
.get_existing_sanitized_node_groups(cluster_info.expect("A cluster should be present at this create stage"))
|
||||
{
|
||||
Ok(x) => x,
|
||||
Err(e) => {
|
||||
match e {
|
||||
ScwNodeGroupErrors::CloudProviderApiError(c) => {
|
||||
return Err(EngineError::new_missing_api_info_from_cloud_provider_error(
|
||||
event_details.clone(),
|
||||
Some(c),
|
||||
))
|
||||
}
|
||||
ScwNodeGroupErrors::ClusterDoesNotExists(_) => self.logger().log(
|
||||
LogLevel::Info,
|
||||
EngineEvent::Deploying(
|
||||
event_details.clone(),
|
||||
EventMessage::new_from_safe(
|
||||
"cluster do not exists, no node groups can be retrieved for upgrade check".to_string(),
|
||||
),
|
||||
),
|
||||
),
|
||||
ScwNodeGroupErrors::MultipleClusterFound => {
|
||||
let msg = "multiple clusters found, can't match the correct node groups".to_string();
|
||||
return Err(EngineError::new_multiple_cluster_found_expected_one_error(
|
||||
event_details.clone(),
|
||||
CommandError::new(msg.clone(), Some(msg)),
|
||||
));
|
||||
}
|
||||
ScwNodeGroupErrors::NoNodePoolFound(_) => self.logger().log(
|
||||
LogLevel::Info,
|
||||
EngineEvent::Deploying(
|
||||
event_details.clone(),
|
||||
EventMessage::new_from_safe(
|
||||
"cluster exists, but no node groups found for upgrade check".to_string(),
|
||||
),
|
||||
),
|
||||
),
|
||||
ScwNodeGroupErrors::MissingNodePoolInfo => {
|
||||
let msg = format!("Error with Scaleway API while trying to retrieve node pool info");
|
||||
return Err(EngineError::new_missing_api_info_from_cloud_provider_error(
|
||||
event_details.clone(),
|
||||
Some(CommandError::new_from_safe_message(msg)),
|
||||
));
|
||||
}
|
||||
ScwNodeGroupErrors::NodeGroupValidationError(c) => {
|
||||
return Err(EngineError::new_missing_api_info_from_cloud_provider_error(
|
||||
event_details.clone(),
|
||||
Some(c),
|
||||
));
|
||||
}
|
||||
};
|
||||
Vec::with_capacity(0)
|
||||
}
|
||||
};
|
||||
|
||||
// ensure all node groups are in ready state Scaleway side
|
||||
self.logger.log(
|
||||
LogLevel::Info,
|
||||
EngineEvent::Deploying(
|
||||
event_details.clone(),
|
||||
EventMessage::new_from_safe(
|
||||
"ensuring all groups nodes are in ready state from the Scaleway API".to_string(),
|
||||
),
|
||||
),
|
||||
);
|
||||
|
||||
for ng in current_nodegroups {
|
||||
let res = retry::retry(
|
||||
// retry 10 min max per nodegroup until they are ready
|
||||
Fixed::from_millis(15000).take(40),
|
||||
|| {
|
||||
self.logger().log(
|
||||
LogLevel::Info,
|
||||
EngineEvent::Deploying(
|
||||
event_details.clone(),
|
||||
EventMessage::new_from_safe(format!(
|
||||
"checking node group {}/{:?}, current status: {:?}",
|
||||
&ng.name,
|
||||
&ng.id.as_ref().unwrap_or(&"unknown".to_string()),
|
||||
&ng.status
|
||||
)),
|
||||
),
|
||||
);
|
||||
let pool_id = match &ng.id {
|
||||
None => {
|
||||
let msg =
|
||||
"node group id was expected to get info, but not found from Scaleway API".to_string();
|
||||
return OperationResult::Retry(
|
||||
EngineError::new_missing_api_info_from_cloud_provider_error(
|
||||
event_details.clone(),
|
||||
Some(CommandError::new_from_safe_message(msg)),
|
||||
),
|
||||
);
|
||||
}
|
||||
Some(x) => x,
|
||||
};
|
||||
let scw_ng = match self.get_node_group_info(pool_id.as_str()) {
|
||||
Ok(x) => x,
|
||||
Err(e) => {
|
||||
return match e {
|
||||
ScwNodeGroupErrors::CloudProviderApiError(c) => {
|
||||
let current_error = EngineError::new_missing_api_info_from_cloud_provider_error(
|
||||
event_details.clone(),
|
||||
Some(c),
|
||||
);
|
||||
self.logger
|
||||
.log(LogLevel::Error, EngineEvent::Error(current_error.clone()));
|
||||
OperationResult::Retry(current_error)
|
||||
}
|
||||
ScwNodeGroupErrors::ClusterDoesNotExists(c) => {
|
||||
let current_error =
|
||||
EngineError::new_no_cluster_found_error(event_details.clone(), c);
|
||||
self.logger
|
||||
.log(LogLevel::Error, EngineEvent::Error(current_error.clone()));
|
||||
OperationResult::Retry(current_error)
|
||||
}
|
||||
ScwNodeGroupErrors::MultipleClusterFound => {
|
||||
OperationResult::Retry(EngineError::new_multiple_cluster_found_expected_one_error(
|
||||
event_details.clone(),
|
||||
CommandError::new_from_safe_message(
|
||||
"Multiple cluster found while one was expected".to_string(),
|
||||
),
|
||||
))
|
||||
}
|
||||
ScwNodeGroupErrors::NoNodePoolFound(_) => OperationResult::Ok(()),
|
||||
ScwNodeGroupErrors::MissingNodePoolInfo => {
|
||||
OperationResult::Retry(EngineError::new_missing_api_info_from_cloud_provider_error(
|
||||
event_details.clone(),
|
||||
None,
|
||||
))
|
||||
}
|
||||
ScwNodeGroupErrors::NodeGroupValidationError(c) => {
|
||||
let current_error = EngineError::new_missing_api_info_from_cloud_provider_error(
|
||||
event_details.clone(),
|
||||
Some(c),
|
||||
);
|
||||
self.logger
|
||||
.log(LogLevel::Error, EngineEvent::Error(current_error.clone()));
|
||||
OperationResult::Retry(current_error)
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
match scw_ng.status == scaleway_api_rs::models::scaleway_k8s_v1_pool::Status::Ready {
|
||||
true => OperationResult::Ok(()),
|
||||
false => OperationResult::Retry(EngineError::new_k8s_node_not_ready(
|
||||
event_details.clone(),
|
||||
CommandError::new_from_safe_message(format!(
|
||||
"waiting for node group {} to be ready, current status: {:?}",
|
||||
&scw_ng.name, scw_ng.status
|
||||
)),
|
||||
)),
|
||||
}
|
||||
},
|
||||
);
|
||||
match res {
|
||||
Ok(_) => {}
|
||||
Err(Operation { error, .. }) => return Err(error),
|
||||
Err(retry::Error::Internal(msg)) => {
|
||||
return Err(EngineError::new_k8s_node_not_ready(
|
||||
event_details.clone(),
|
||||
CommandError::new(msg, Some("Waiting for too long worker nodes to be ready".to_string())),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
self.logger.log(
|
||||
LogLevel::Info,
|
||||
EngineEvent::Deploying(
|
||||
event_details.clone(),
|
||||
EventMessage::new_from_safe(
|
||||
"all node groups for this cluster are ready from cloud provider API".to_string(),
|
||||
),
|
||||
),
|
||||
);
|
||||
|
||||
// ensure all nodes are ready on Kubernetes
|
||||
match self.check_workers_on_create() {
|
||||
Ok(_) => {
|
||||
self.send_to_customer(
|
||||
@@ -793,7 +1197,10 @@ impl<'a> Kapsule<'a> {
|
||||
};
|
||||
|
||||
if tf_workers_resources.is_empty() {
|
||||
return Err(EngineError::new_cluster_has_no_worker_nodes(event_details.clone()));
|
||||
return Err(EngineError::new_cluster_has_no_worker_nodes(
|
||||
event_details.clone(),
|
||||
None,
|
||||
));
|
||||
}
|
||||
|
||||
let kubernetes_config_file_path = self.get_kubeconfig_file_path()?;
|
||||
|
||||
@@ -88,6 +88,47 @@ impl FromStr for ScwInstancesType {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct ScwNodeGroup {
|
||||
pub name: String,
|
||||
pub id: Option<String>,
|
||||
pub min_nodes: i32,
|
||||
pub max_nodes: i32,
|
||||
pub instance_type: String,
|
||||
pub disk_size_in_gib: i32,
|
||||
pub status: scaleway_api_rs::models::scaleway_k8s_v1_pool::Status,
|
||||
}
|
||||
|
||||
impl ScwNodeGroup {
|
||||
pub fn new(
|
||||
id: Option<String>,
|
||||
group_name: String,
|
||||
min_nodes: i32,
|
||||
max_nodes: i32,
|
||||
instance_type: String,
|
||||
disk_size_in_gib: i32,
|
||||
status: scaleway_api_rs::models::scaleway_k8s_v1_pool::Status,
|
||||
) -> Result<Self, CommandError> {
|
||||
if min_nodes > max_nodes {
|
||||
let msg = format!(
|
||||
"The number of minimum nodes ({}) for group name {} is higher than maximum nodes ({})",
|
||||
&group_name, &min_nodes, &max_nodes
|
||||
);
|
||||
return Err(CommandError::new_from_safe_message(msg));
|
||||
}
|
||||
|
||||
Ok(ScwNodeGroup {
|
||||
name: group_name,
|
||||
id,
|
||||
min_nodes,
|
||||
max_nodes,
|
||||
instance_type,
|
||||
disk_size_in_gib,
|
||||
status,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
#[cfg(test)]
|
||||
@@ -104,6 +145,7 @@ mod tests {
|
||||
NodeGroups::new("".to_string(), 2, 2, "dev1-l".to_string(), 20).unwrap(),
|
||||
NodeGroups {
|
||||
name: "".to_string(),
|
||||
id: None,
|
||||
min_nodes: 2,
|
||||
max_nodes: 2,
|
||||
instance_type: "dev1-l".to_string(),
|
||||
|
||||
@@ -76,6 +76,9 @@ pub enum Tag {
|
||||
CannotGetCluster,
|
||||
ObjectStorageCannotCreateBucket,
|
||||
ObjectStorageCannotPutFileIntoBucket,
|
||||
NoClusterFound,
|
||||
OnlyOneClusterExpected,
|
||||
CloudProviderApiMissingInfo,
|
||||
}
|
||||
|
||||
impl From<errors::Tag> for Tag {
|
||||
@@ -139,6 +142,9 @@ impl From<errors::Tag> for Tag {
|
||||
errors::Tag::UnsupportedZone => Tag::UnsupportedZone,
|
||||
errors::Tag::K8sNodeIsNotReadyWithTheRequestedVersion => Tag::K8sNodeIsNotReadyWithTheRequestedVersion,
|
||||
errors::Tag::K8sNodeIsNotReady => Tag::K8sNodeIsNotReady,
|
||||
errors::Tag::NoClusterFound => Tag::NoClusterFound,
|
||||
errors::Tag::OnlyOneClusterExpected => Tag::OnlyOneClusterExpected,
|
||||
errors::Tag::CloudProviderApiMissingInfo => Tag::CloudProviderApiMissingInfo,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -88,6 +88,8 @@ pub enum Tag {
|
||||
Unknown,
|
||||
/// MissingRequiredEnvVariable: represents an error where a required env variable is not set.
|
||||
MissingRequiredEnvVariable,
|
||||
/// NoClusterFound: represents an error where no cluster was found
|
||||
NoClusterFound,
|
||||
/// ClusterHasNoWorkerNodes: represents an error where the current cluster doesn't have any worker nodes.
|
||||
ClusterHasNoWorkerNodes,
|
||||
/// CannotGetWorkspaceDirectory: represents an error while trying to get workspace directory.
|
||||
@@ -188,10 +190,14 @@ pub enum Tag {
|
||||
CannotGetSupportedVersions,
|
||||
/// CannotGetCluster: represents an error where we cannot get cluster.
|
||||
CannotGetCluster,
|
||||
/// OnlyOneClusterExpected: represents an error where only one cluster was expected but several where found
|
||||
OnlyOneClusterExpected,
|
||||
/// ObjectStorageCannotCreateBucket: represents an error while trying to create a new object storage bucket.
|
||||
ObjectStorageCannotCreateBucket,
|
||||
/// ObjectStorageCannotPutFileIntoBucket: represents an error while trying to put a file into an object storage bucket.
|
||||
ObjectStorageCannotPutFileIntoBucket,
|
||||
/// CloudProviderApiMissingInfo: represents an error while expecting mandatory info
|
||||
CloudProviderApiMissingInfo,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -353,14 +359,18 @@ impl EngineError {
|
||||
/// Arguments:
|
||||
///
|
||||
/// * `event_details`: Error linked event details.
|
||||
pub fn new_cluster_has_no_worker_nodes(event_details: EventDetails) -> EngineError {
|
||||
/// * `raw_error`: Raw error message.
|
||||
pub fn new_cluster_has_no_worker_nodes(
|
||||
event_details: EventDetails,
|
||||
raw_error: Option<CommandError>,
|
||||
) -> EngineError {
|
||||
let message = "No worker nodes present, can't proceed with operation.";
|
||||
EngineError::new(
|
||||
event_details,
|
||||
Tag::ClusterHasNoWorkerNodes,
|
||||
message.to_string(),
|
||||
message.to_string(),
|
||||
None,
|
||||
raw_error,
|
||||
None,
|
||||
Some(
|
||||
"This can happen if there where a manual operations on the workers or the infrastructure is paused."
|
||||
@@ -369,6 +379,32 @@ impl EngineError {
|
||||
)
|
||||
}
|
||||
|
||||
/// Missing API info from the Cloud provider itself
|
||||
///
|
||||
///
|
||||
///
|
||||
/// Arguments:
|
||||
///
|
||||
/// * `event_details`: Error linked event details.
|
||||
/// * `raw_error`: Raw error message.
|
||||
pub fn new_missing_api_info_from_cloud_provider_error(
|
||||
event_details: EventDetails,
|
||||
raw_error: Option<CommandError>,
|
||||
) -> EngineError {
|
||||
let message = "Error, missing required information from the Cloud Provider API";
|
||||
EngineError::new(
|
||||
event_details,
|
||||
Tag::CloudProviderApiMissingInfo,
|
||||
message.to_string(),
|
||||
message.to_string(),
|
||||
raw_error,
|
||||
None,
|
||||
Some(
|
||||
"This can happen if the cloud provider is encountering issues. You should try again later".to_string(),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
/// Creates new error for unsupported instance type.
|
||||
///
|
||||
/// Cloud provider doesn't support the requested instance type.
|
||||
@@ -1678,4 +1714,67 @@ impl EngineError {
|
||||
Some("Maybe there is a lag and cluster is not yet reported, please retry later.".to_string()),
|
||||
)
|
||||
}
|
||||
|
||||
/// Creates new error while trying to get cluster.
|
||||
///
|
||||
/// Arguments:
|
||||
///
|
||||
/// * `event_details`: Error linked event details.
|
||||
/// * `raw_error`: Raw error message.
|
||||
pub fn new_missing_workers_group_info_error(event_details: EventDetails, raw_error: CommandError) -> EngineError {
|
||||
let message = "Error, cannot get cluster.";
|
||||
|
||||
EngineError::new(
|
||||
event_details,
|
||||
Tag::CannotGetCluster,
|
||||
message.to_string(),
|
||||
message.to_string(),
|
||||
Some(raw_error),
|
||||
None,
|
||||
Some("Maybe there is a lag and cluster is not yet reported, please retry later.".to_string()),
|
||||
)
|
||||
}
|
||||
|
||||
/// No cluster found
|
||||
///
|
||||
/// Arguments:
|
||||
///
|
||||
/// * `event_details`: Error linked event details.
|
||||
/// * `raw_error`: Raw error message.
|
||||
pub fn new_no_cluster_found_error(event_details: EventDetails, raw_error: CommandError) -> EngineError {
|
||||
let message = "Error, no cluster found.";
|
||||
|
||||
EngineError::new(
|
||||
event_details,
|
||||
Tag::CannotGetCluster,
|
||||
message.to_string(),
|
||||
message.to_string(),
|
||||
Some(raw_error),
|
||||
None,
|
||||
Some("Maybe there is a lag and cluster is not yet reported, please retry later.".to_string()),
|
||||
)
|
||||
}
|
||||
|
||||
/// Too many clusters found, while expected only one
|
||||
///
|
||||
/// Arguments:
|
||||
///
|
||||
/// * `event_details`: Error linked event details.
|
||||
/// * `raw_error`: Raw error message.
|
||||
pub fn new_multiple_cluster_found_expected_one_error(
|
||||
event_details: EventDetails,
|
||||
raw_error: CommandError,
|
||||
) -> EngineError {
|
||||
let message = "Too many clusters found with this name, where 1 was expected";
|
||||
|
||||
EngineError::new(
|
||||
event_details,
|
||||
Tag::OnlyOneClusterExpected,
|
||||
message.to_string(),
|
||||
message.to_string(),
|
||||
Some(raw_error),
|
||||
None,
|
||||
Some("Please contact Qovery support for investigation.".to_string()),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -102,7 +102,7 @@ fn test_build_cache() {
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
let mut environment = test_utilities::common::working_minimal_environment(
|
||||
let environment = test_utilities::common::working_minimal_environment(
|
||||
&context,
|
||||
secrets
|
||||
.DEFAULT_TEST_DOMAIN
|
||||
@@ -118,9 +118,9 @@ fn test_build_cache() {
|
||||
let app_build = app.to_build();
|
||||
let _ = match local_docker.has_cache(&app_build) {
|
||||
Ok(CacheResult::Hit) => assert!(false),
|
||||
Ok(CacheResult::Miss(parent_build)) => assert!(true),
|
||||
Ok(CacheResult::Miss(_)) => assert!(true),
|
||||
Ok(CacheResult::MissWithoutParentBuild) => assert!(false),
|
||||
Err(err) => assert!(false),
|
||||
Err(_) => assert!(false),
|
||||
};
|
||||
|
||||
let _ = match ecr.pull(&image).unwrap() {
|
||||
@@ -147,9 +147,9 @@ fn test_build_cache() {
|
||||
|
||||
let _ = match local_docker.has_cache(&build_result.build) {
|
||||
Ok(CacheResult::Hit) => assert!(true),
|
||||
Ok(CacheResult::Miss(parent_build)) => assert!(false),
|
||||
Ok(CacheResult::Miss(_)) => assert!(false),
|
||||
Ok(CacheResult::MissWithoutParentBuild) => assert!(false),
|
||||
Err(err) => assert!(false),
|
||||
Err(_) => assert!(false),
|
||||
};
|
||||
|
||||
let start_pull_time = SystemTime::now();
|
||||
@@ -1111,6 +1111,10 @@ fn deploy_a_non_working_environment_with_a_working_failover_on_aws_eks() {
|
||||
fn deploy_2_non_working_environments_with_2_working_failovers_on_aws_eks() {
|
||||
init();
|
||||
|
||||
let test_name = function_name!();
|
||||
let span = span!(Level::INFO, "test", name = test_name);
|
||||
let _enter = span.enter();
|
||||
|
||||
let logger = logger();
|
||||
let secrets = FuncTestsSecrets::new();
|
||||
|
||||
|
||||
@@ -104,7 +104,7 @@ fn test_build_cache() {
|
||||
.expect("DIGITAL_OCEAN_TEST_CLUSTER_ID is not set"),
|
||||
);
|
||||
|
||||
let mut environment = test_utilities::common::working_minimal_environment(
|
||||
let environment = test_utilities::common::working_minimal_environment(
|
||||
&context,
|
||||
secrets
|
||||
.DEFAULT_TEST_DOMAIN
|
||||
@@ -120,9 +120,9 @@ fn test_build_cache() {
|
||||
let app_build = app.to_build();
|
||||
let _ = match local_docker.has_cache(&app_build) {
|
||||
Ok(CacheResult::Hit) => assert!(false),
|
||||
Ok(CacheResult::Miss(parent_build)) => assert!(true),
|
||||
Ok(CacheResult::Miss(_)) => assert!(true),
|
||||
Ok(CacheResult::MissWithoutParentBuild) => assert!(false),
|
||||
Err(err) => assert!(false),
|
||||
Err(_) => assert!(false),
|
||||
};
|
||||
|
||||
let _ = match docr.pull(&image).unwrap() {
|
||||
@@ -149,9 +149,9 @@ fn test_build_cache() {
|
||||
|
||||
let _ = match local_docker.has_cache(&build_result.build) {
|
||||
Ok(CacheResult::Hit) => assert!(true),
|
||||
Ok(CacheResult::Miss(parent_build)) => assert!(false),
|
||||
Ok(CacheResult::Miss(_)) => assert!(false),
|
||||
Ok(CacheResult::MissWithoutParentBuild) => assert!(false),
|
||||
Err(err) => assert!(false),
|
||||
Err(_) => assert!(false),
|
||||
};
|
||||
|
||||
let start_pull_time = SystemTime::now();
|
||||
|
||||
@@ -109,7 +109,7 @@ fn test_build_cache() {
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
let mut environment = test_utilities::common::working_minimal_environment(
|
||||
let environment = test_utilities::common::working_minimal_environment(
|
||||
&context,
|
||||
secrets
|
||||
.DEFAULT_TEST_DOMAIN
|
||||
@@ -125,9 +125,9 @@ fn test_build_cache() {
|
||||
let app_build = app.to_build();
|
||||
let _ = match local_docker.has_cache(&app_build) {
|
||||
Ok(CacheResult::Hit) => assert!(false),
|
||||
Ok(CacheResult::Miss(parent_build)) => assert!(true),
|
||||
Ok(CacheResult::Miss(_)) => assert!(true),
|
||||
Ok(CacheResult::MissWithoutParentBuild) => assert!(false),
|
||||
Err(err) => assert!(false),
|
||||
Err(_) => assert!(false),
|
||||
};
|
||||
|
||||
let _ = match scr.pull(&image).unwrap() {
|
||||
@@ -154,9 +154,9 @@ fn test_build_cache() {
|
||||
|
||||
let _ = match local_docker.has_cache(&build_result.build) {
|
||||
Ok(CacheResult::Hit) => assert!(true),
|
||||
Ok(CacheResult::Miss(parent_build)) => assert!(false),
|
||||
Ok(CacheResult::Miss(_)) => assert!(false),
|
||||
Ok(CacheResult::MissWithoutParentBuild) => assert!(false),
|
||||
Err(err) => assert!(false),
|
||||
Err(_) => assert!(false),
|
||||
};
|
||||
|
||||
let start_pull_time = SystemTime::now();
|
||||
|
||||
Reference in New Issue
Block a user