Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
84b5a39
add pester tests for k8s-extension
Mar 13, 2025
feb18f1
fix testcases for nodepool image issues (#5)
bavneetsingh16 May 23, 2025
c39d105
update readme and version release notes (#6)
bavneetsingh16 May 23, 2025
a9ecd65
fix: simplify logic and enable correct recording rule groups for mana…
bragi92 May 29, 2025
4683050
update readme and version release notes (#6)
bavneetsingh16 May 23, 2025
cb0788d
fix: simplify logic and enable correct recording rule groups for mana…
bragi92 May 29, 2025
bb96daf
Extend ContainerInsights Extension for high log scale mode support (#9)
wanlonghenry Jul 22, 2025
91618ab
update python version to 3.13 (#10)
bavneetsingh16 Jul 31, 2025
f652dac
add pester tests for k8s-extension
Mar 13, 2025
da67e10
fix testcases for nodepool image issues (#5)
bavneetsingh16 May 23, 2025
2f18092
update readme and version release notes (#6)
bavneetsingh16 May 23, 2025
5270054
fix: simplify logic and enable correct recording rule groups for mana…
bragi92 May 29, 2025
7cd1554
update readme and version release notes (#6)
bavneetsingh16 May 23, 2025
68513b4
fix: simplify logic and enable correct recording rule groups for mana…
bragi92 May 29, 2025
605bd21
Extend ContainerInsights Extension for high log scale mode support (#9)
wanlonghenry Jul 22, 2025
e2a51e1
update python version to 3.13 (#10)
bavneetsingh16 Jul 31, 2025
e73f4f4
Merge branch 'main' of https://github.com/AzureArcForKubernetes/k8s-e…
andborja Aug 6, 2025
94bb278
Merge branch 'main' of https://github.com/AzureArcForKubernetes/k8s-e…
andborja Aug 20, 2025
c586523
Add k8s-extension troubleshoot.
andborja Aug 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Extend ContainerInsights Extension for high log scale mode support (#9)
  • Loading branch information
wanlonghenry authored and andborja committed Aug 6, 2025
commit 605bd21435a7e2f492f28561282b152ae46dd216
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,21 @@
logger = get_logger(__name__)
DCR_API_VERSION = "2022-06-01"

ContainerInsightsStreams = [
"Microsoft-ContainerLog",
"Microsoft-ContainerLogV2-HighScale",
"Microsoft-KubeEvents",
"Microsoft-KubePodInventory",
"Microsoft-KubeNodeInventory",
"Microsoft-KubePVInventory",
"Microsoft-KubeServices",
"Microsoft-KubeMonAgentEvents",
"Microsoft-InsightsMetrics",
"Microsoft-ContainerInventory",
"Microsoft-ContainerNodeInventory",
"Microsoft-Perf",
]


class ContainerInsights(DefaultExtension):
def Create(self, cmd, client, resource_group_name, cluster_name, name, cluster_type, cluster_rp,
Expand Down Expand Up @@ -83,6 +98,7 @@ def Delete(self, cmd, client, resource_group_name, cluster_name, name, cluster_t
# Delete DCR-A if it exists incase of MSI Auth
useAADAuth = False
isDCRAExists = False
enable_high_log_scale_mode = False
cluster_rp, _ = get_cluster_rp_api_version(cluster_type=cluster_type, cluster_rp=cluster_rp)
try:
extension = client.get(resource_group_name, cluster_rp, cluster_type, cluster_name, name)
Expand All @@ -95,10 +111,15 @@ def Delete(self, cmd, client, resource_group_name, cluster_name, name, cluster_t
return

subscription_id = get_subscription_id(cmd.cli_ctx)
resources = cf_resources(cmd.cli_ctx, subscription_id)
# handle cluster type here
cluster_resource_id = '/subscriptions/{0}/resourceGroups/{1}/providers/{2}/{3}/{4}'.format(subscription_id, resource_group_name, cluster_rp, cluster_type, cluster_name)
workspace_resource_id = None
if (extension is not None) and (extension.configuration_settings is not None):
configSettings = extension.configuration_settings
# Extract workspace resource ID if present
if 'logAnalyticsWorkspaceResourceID' in configSettings:
workspace_resource_id = configSettings['logAnalyticsWorkspaceResourceID']
# omsagent is being renamed to ama-logs. Check for both for compatibility
if 'omsagent.useAADAuth' in configSettings:
useAADAuthSetting = configSettings['omsagent.useAADAuth']
Expand All @@ -108,6 +129,16 @@ def Delete(self, cmd, client, resource_group_name, cluster_name, name, cluster_t
useAADAuthSetting = configSettings['amalogs.useAADAuth']
if (isinstance(useAADAuthSetting, str) and str(useAADAuthSetting).lower() == "true") or (isinstance(useAADAuthSetting, bool) and useAADAuthSetting):
useAADAuth = True

# Check if high log scale mode was enabled
if useAADAuth and 'amalogs.enableHighLogScaleMode' in configSettings:
highLogScaleSetting = configSettings['amalogs.enableHighLogScaleMode']
if isinstance(highLogScaleSetting, str):
enable_high_log_scale_mode = (highLogScaleSetting.lower() == "true")
elif isinstance(highLogScaleSetting, bool):
enable_high_log_scale_mode = highLogScaleSetting
else:
raise InvalidArgumentValueError('amalogs.enableHighLogScaleMode value MUST be either true/false or boolean type')
if useAADAuth:
association_url = cmd.cli_ctx.cloud.endpoints.resource_manager + f"{cluster_resource_id}/providers/Microsoft.Insights/dataCollectionRuleAssociations/ContainerInsightsExtension?api-version={DCR_API_VERSION}"
for _ in range(3):
Expand All @@ -131,6 +162,41 @@ def Delete(self, cmd, client, resource_group_name, cluster_name, name, cluster_t
except Exception:
pass # its OK to ignore the exception since MSI auth in preview

if useAADAuth:
# Get the workspace region if workspace_resource_id is available
workspace_region = None
if workspace_resource_id:
try:
workspace_resource = resources.get_by_id(workspace_resource_id, '2015-11-01-preview')
workspace_region = workspace_resource.location.replace(" ", "").lower()
except Exception as ex:
logger.warning("Skipping DCR and DCE deletion due to inability to determine workspace region")
return
# If workspace_region still couldn't be determined, skip deletion
if not workspace_region:
logger.warning("Workspace region could not be determined. Skipping DCR and DCE deletion.")
return

# Use workspace_region for DCR name to match creation logic
dcr_name = f"MSCI-{workspace_region}-{cluster_name}"
dcr_name = dcr_name[0:64]

dcr_resource_id = f"/subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/providers/Microsoft.Insights/dataCollectionRules/{dcr_name}"
dcr_url = cmd.cli_ctx.cloud.endpoints.resource_manager + f"{dcr_resource_id}?api-version={DCR_API_VERSION}"
response = send_raw_request(cmd.cli_ctx, "GET", dcr_url)
dcr_config = json.loads(response.text)
# Delete the DCR
for _ in range(3):
try:
send_raw_request(cmd.cli_ctx, "DELETE", dcr_url,)
logger.info(f"Successfully deleted DCR: {dcr_name}")
break
except Exception as ex:
logger.warning(f"Error deleting DCR: {str(ex)}")
pass

if enable_high_log_scale_mode:
_delete_dce_for_dcr(cmd, subscription_id, resource_group_name, dcr_config)

# Custom Validation Logic for Container Insights

Expand Down Expand Up @@ -464,6 +530,7 @@ def _get_container_insights_settings(cmd, cluster_resource_group_name, cluster_r
subscription_id = get_subscription_id(cmd.cli_ctx)
workspace_resource_id = ''
useAADAuth = True
enableHighLogScaleMode = False # Default value
if 'amalogs.useAADAuth' not in configuration_settings:
configuration_settings['amalogs.useAADAuth'] = "true"
extensionSettings = {}
Expand Down Expand Up @@ -520,6 +587,16 @@ def _get_container_insights_settings(cmd, cluster_resource_group_name, cluster_r
raise InvalidArgumentValueError('streams must be an array type')
extensionSettings["dataCollectionSettings"] = dataCollectionSettings

if useAADAuth and 'amalogs.enableHighLogScaleMode' in configuration_settings:
enableHighLogScaleMode = configuration_settings['amalogs.enableHighLogScaleMode']
if isinstance(enableHighLogScaleMode, str):
enableHighLogScaleMode_str = enableHighLogScaleMode.lower()
if enableHighLogScaleMode_str not in ["true", "false"]:
raise InvalidArgumentValueError('amalogs.enableHighLogScaleMode value MUST be either true or false')
enableHighLogScaleMode = (enableHighLogScaleMode_str == "true")
elif not isinstance(enableHighLogScaleMode, bool):
raise InvalidArgumentValueError('amalogs.enableHighLogScaleMode value MUST be either true or false')

workspace_resource_id = workspace_resource_id.strip()

if configuration_protected_settings is not None:
Expand Down Expand Up @@ -548,7 +625,7 @@ def _get_container_insights_settings(cmd, cluster_resource_group_name, cluster_r
if is_ci_extension_type:
if useAADAuth:
logger.info("creating data collection rule and association")
_ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_resource_group_name, cluster_rp, cluster_type, cluster_name, workspace_resource_id, extensionSettings)
_ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_resource_group_name, cluster_rp, cluster_type, cluster_name, workspace_resource_id, extensionSettings, enableHighLogScaleMode)
elif not _is_container_insights_solution_exists(cmd, workspace_resource_id):
logger.info("Creating ContainerInsights solution resource, since it doesn't exist and it is using legacy authentication")
_ensure_container_insights_for_monitoring(cmd, workspace_resource_id).result()
Expand Down Expand Up @@ -597,6 +674,37 @@ def _get_container_insights_settings(cmd, cluster_resource_group_name, cluster_r
configuration_settings['amalogs.domain'] = 'opinsights.azure.microsoft.scloud'


def _delete_dce_for_dcr(cmd, subscription_id, cluster_resource_group_name, dcr_config):
"""Delete Data Collection Endpoint associated with a DCR if it exists"""
try:
if ("properties" in dcr_config and
"dataCollectionEndpointId" in dcr_config["properties"] and
dcr_config["properties"]["dataCollectionEndpointId"]):

dce_id = dcr_config["properties"]["dataCollectionEndpointId"]
dce_parts = dce_id.split('/')

if len(dce_parts) > 0:
dce_name = dce_parts[-1]
dce_resource_id = f"/subscriptions/{subscription_id}/resourceGroups/{cluster_resource_group_name}/providers/Microsoft.Insights/dataCollectionEndpoints/{dce_name}"
dce_url = cmd.cli_ctx.cloud.endpoints.resource_manager + f"{dce_resource_id}?api-version=2022-06-01"
# Try to delete up to 3 times
for retry in range(3):
try:
send_raw_request(cmd.cli_ctx, "DELETE", dce_url)
logger.info("Successfully deleted DCE: %s", dce_name)
return True
except CLIError as e:
if "ResourceNotFound" in str(e):
return True
if retry == 2:
logger.warning("Failed to delete DCE: %s - %s", dce_name, str(e))
return False
logger.info("Retrying DCE deletion after error: %s", str(e))
except CLIError:
pass
return True

def get_existing_container_insights_extension_dcr_tags(cmd, dcr_url):
tags = {}
_MAX_RETRY_TIMES = 3
Expand All @@ -617,7 +725,7 @@ def get_existing_container_insights_extension_dcr_tags(cmd, dcr_url):
return tags


def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_resource_group_name, cluster_rp, cluster_type, cluster_name, workspace_resource_id, extensionSettings):
def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_resource_group_name, cluster_rp, cluster_type, cluster_name, workspace_resource_id, extensionSettings, enable_high_log_scale_mode):
from azure.core.exceptions import HttpResponseError

cluster_region = ''
Expand Down Expand Up @@ -652,6 +760,18 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
dataCollectionRuleName = dataCollectionRuleName[0:64]
dcr_resource_id = f"/subscriptions/{subscription_id}/resourceGroups/{cluster_resource_group_name}/providers/Microsoft.Insights/dataCollectionRules/{dataCollectionRuleName}"

# ingestion DCE MUST be in workspace region
ingestionDataCollectionEndpointName = f"MSCI-ingest-{workspace_region}-{cluster_name}"
# Max length of the DCE name is 43 chars
ingestionDataCollectionEndpointName = _trim_suffix_if_needed(ingestionDataCollectionEndpointName[0:43])
ingestion_dce_resource_id = None

# create ingestion DCE if high log scale mode enabled
if enable_high_log_scale_mode:
ingestion_dce_resource_id = create_data_collection_endpoint(
cmd, subscription_id, cluster_resource_group_name, workspace_region, ingestionDataCollectionEndpointName
)

# first get the association between region display names and region IDs (because for some reason
# the "which RPs are available in which regions" check returns region display names)
region_names_to_id = {}
Expand All @@ -677,6 +797,8 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
# get existing tags on the container insights extension DCR if the customer added any
existing_tags = get_existing_container_insights_extension_dcr_tags(cmd, dcr_url)
streams = ["Microsoft-ContainerInsights-Group-Default"]
if enable_high_log_scale_mode:
streams = ContainerInsightsStreams
if extensionSettings is None:
extensionSettings = {}
if 'dataCollectionSettings' in extensionSettings.keys():
Expand All @@ -691,6 +813,11 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
}
extensionSettings["dataCollectionSettings"] = dataCollectionSettings

if enable_high_log_scale_mode:
for i, v in enumerate(streams):
if v == "Microsoft-ContainerLogV2":
streams[i] = "Microsoft-ContainerLogV2-HighScale"

# create the DCR
dcr_creation_body = json.dumps(
{
Expand Down Expand Up @@ -722,6 +849,7 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
}
]
},
"dataCollectionEndpointId": ingestion_dce_resource_id
},
}
)
Expand Down Expand Up @@ -755,3 +883,34 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
error = e
else:
raise error


def create_data_collection_endpoint(cmd, subscription_id, cluster_resource_group_name, workspace_region, ingestionDataCollectionEndpointName):
# create the ingestion DCE
ingestion_dce_resource_id = f"/subscriptions/{subscription_id}/resourceGroups/{cluster_resource_group_name}/providers/Microsoft.Insights/dataCollectionEndpoints/{ingestionDataCollectionEndpointName}"
ingestion_dce_url = cmd.cli_ctx.cloud.endpoints.resource_manager + f"{ingestion_dce_resource_id}?api-version=2022-06-01"
ingestion_dce_creation_body = json.dumps({
"location": workspace_region,
"kind": "Linux",
"properties": {
"networkAcls": {
"publicNetworkAccess": "Enabled"
}
}
})
error = None
for _ in range(3):
try:
send_raw_request(cmd.cli_ctx, "PUT", ingestion_dce_url, body=ingestion_dce_creation_body)
return ingestion_dce_resource_id
except AzCLIError as e:
error = e
if error:
raise error
return ingestion_dce_resource_id


def _trim_suffix_if_needed(s, suffix="-"):
if s.endswith(suffix):
s = s[:-len(suffix)]
return s
4 changes: 4 additions & 0 deletions testing/pipeline/k8s-custom-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ stages:
parameters:
jobName: AzureMonitor
path: ./test/extensions/public/AzureMonitor.Tests.ps1
- template: ./templates/run-test.yml
parameters:
jobName: AzureMonitorHighScale
path: ./test/extensions/public/AzureMonitorHighScale.Tests.ps1
- template: ./templates/run-test.yml
parameters:
jobName: AzurePolicy
Expand Down
75 changes: 75 additions & 0 deletions testing/test/extensions/public/AzureMonitorHighScale.Tests.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
Describe 'Azure Monitor High Scale Mode Testing' {
BeforeAll {
$extensionType = "microsoft.azuremonitor.containers"
$extensionName = "azuremonitor-containers"
$extensionAgentName = "omsagent"
$extensionAgentNamespace = "kube-system"

. $PSScriptRoot/../../helper/Constants.ps1
. $PSScriptRoot/../../helper/Helper.ps1
}

It 'Creates the extension with high log scale mode and checks that it onboards correctly' {
az $Env:K8sExtensionName create -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) `
--cluster-type connectedClusters --extension-type $extensionType -n $extensionName `
--configuration-settings "amalogs.enableHighLogScaleMode=true" --no-wait
$? | Should -BeTrue

$output = az $Env:K8sExtensionName show -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters -n $extensionName
$? | Should -BeTrue

$extension = ($output | ConvertFrom-Json)
$isAutoUpgradeMinorVersion = $extension.autoUpgradeMinorVersion
$isAutoUpgradeMinorVersion.ToString() -eq "True" | Should -BeTrue

# Verify high scale mode configuration
$settings = $extension.configurationSettings
$settings.'amalogs.enableHighLogScaleMode' | Should -Be "true"

# Loop and retry until the extension installs
$n = 0
do
{
if (Has-ExtensionData $extensionName) {
break
}
Start-Sleep -Seconds 10
$n += 1
} while ($n -le $MAX_RETRY_ATTEMPTS)
$n | Should -BeLessOrEqual $MAX_RETRY_ATTEMPTS
}

It "Performs a show on the extension" {
$output = az $Env:K8sExtensionName show -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters -n $extensionName
$? | Should -BeTrue
$output | Should -Not -BeNullOrEmpty
}

It "Lists the extensions on the cluster" {
$output = az $Env:K8sExtensionName list -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters
$? | Should -BeTrue

$output | Should -Not -BeNullOrEmpty
$extensionExists = $output | ConvertFrom-Json | Where-Object { $_.extensionType -eq $extensionType }
$extensionExists | Should -Not -BeNullOrEmpty
}

It "Deletes the extension from the cluster" {
$output = az $Env:K8sExtensionName delete -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters -n $extensionName --force
$? | Should -BeTrue

# Extension should not be found on the cluster
$output = az $Env:K8sExtensionName show -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters -n $extensionName
$? | Should -BeFalse
$output | Should -BeNullOrEmpty
}

It "Performs another list after the delete" {
$output = az $Env:K8sExtensionName list -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters
$? | Should -BeTrue
$output | Should -Not -BeNullOrEmpty

$extensionExists = $output | ConvertFrom-Json | Where-Object { $_.extensionType -eq $extensionName }
$extensionExists | Should -BeNullOrEmpty
}
}