Skip to content
Merged
Show file tree
Hide file tree
Changes from 64 commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
2f04c66
Added pipeline to build embeddings
raych1 Mar 15, 2024
e50b402
Added secret variables
raych1 Mar 15, 2024
299a6f0
Updated nuget.config
raych1 Mar 15, 2024
b60c178
Updated yml file
raych1 Mar 15, 2024
b1cf977
Updated working directory in the script
raych1 Mar 15, 2024
f2e8a09
Fixed script issue
raych1 Mar 15, 2024
0e78a03
Fixed script name case issue
raych1 Mar 15, 2024
bcf84d1
Fixed working directory value
raych1 Mar 15, 2024
c0ca0c1
checkout enghub repo in yml
raych1 Mar 15, 2024
c951e6e
updated the enghub checkout url
raych1 Mar 15, 2024
f8efa44
Checkout sdk tools repo
raych1 Mar 15, 2024
48a9980
Updated repo folders
raych1 Mar 16, 2024
9ebadac
Updated path of script file
raych1 Mar 16, 2024
74dfa32
Fixed checkout path in the yml file
raych1 Mar 16, 2024
94328e6
Introduce common jobs to setup env
raych1 Mar 16, 2024
869776e
Fixed syntax error in yml
raych1 Mar 16, 2024
2040a01
Use stages
raych1 Mar 16, 2024
cd23ba4
Fixed dependsOn error
raych1 Mar 16, 2024
f5bb2dd
Fixed stage dependson
raych1 Mar 16, 2024
f1e45ef
Updated pipeline file name
raych1 Mar 16, 2024
00d8ff5
Added common pipeline template
raych1 Mar 16, 2024
a6ee185
Enable local run for the script
raych1 Mar 18, 2024
4802758
Added two more stages in pipeline yml
raych1 Mar 18, 2024
d5ce93b
Updated github repo url
raych1 Mar 18, 2024
acba8dd
Removed the typespec repo stage temporaily
raych1 Mar 18, 2024
93d1908
list python package installed
raych1 Mar 18, 2024
d69483a
Use python -m command
raych1 Mar 18, 2024
8185ce9
Print content of installed python packages
raych1 Mar 18, 2024
e42b504
Added debug script
raych1 Mar 18, 2024
ceb0471
Redirect log stream
raych1 Mar 18, 2024
0673c86
comment measure-command
raych1 Mar 18, 2024
136147d
Output log by variables
raych1 Mar 18, 2024
5a004ec
Use liunx agent pool
raych1 Mar 19, 2024
5eb4c1b
Added conda python env
raych1 Mar 19, 2024
e157841
Use pwsh shell
raych1 Mar 19, 2024
a7672b6
Use windows os
raych1 Mar 19, 2024
7cff8ac
Use pwsh step
raych1 Mar 19, 2024
37a019e
Use full path to conda
raych1 Mar 19, 2024
e3e7550
Added conda init
raych1 Mar 19, 2024
825b5fd
Run conda with full path
raych1 Mar 19, 2024
d774e31
Call python thur conda env
raych1 Mar 19, 2024
1968ac9
Install conda in pwsh
raych1 Mar 19, 2024
bfa51c1
debug variables
raych1 Mar 20, 2024
139b63f
Use conda for local run too
raych1 Mar 20, 2024
8ad1029
make blob container configurable
raych1 Mar 20, 2024
99b3620
Add missing ending brace
raych1 Mar 20, 2024
4013183
Added condapath to output
raych1 Mar 20, 2024
8946f61
Test without conda
raych1 Mar 22, 2024
489ab6f
Delete python version parameter
raych1 Mar 22, 2024
7ddf6e4
Use same source path when checkout one repo only
raych1 Mar 22, 2024
c571ac5
Remove conda env
raych1 Mar 25, 2024
9f7bc6b
Updated customizedDocEmbeddings script path
raych1 Mar 25, 2024
bad10b2
Use 3.11 python version
raych1 Mar 25, 2024
9b3f69b
Added typespec embeddings build step
raych1 Mar 25, 2024
800dae7
Remove conda call
raych1 Mar 25, 2024
5178dad
Use raw content url for downloading
raych1 Mar 26, 2024
f576de9
Removed python version setting
raych1 Mar 26, 2024
2065794
Recover the list of customized document
raych1 Mar 26, 2024
0f80819
Output key in metadata file
raych1 Mar 26, 2024
bb8ad60
Variables have to be set in env rather than .py
raych1 Mar 26, 2024
f76f5ff
Merge branch 'main' into user/raych1/add-pipeline
raych1 Mar 26, 2024
82ee5d9
Added codeowner
raych1 Mar 26, 2024
7dd251e
Disable pr trigger
raych1 Mar 26, 2024
d64c40b
Updated readme.md
raych1 Mar 26, 2024
ce79176
Remove renhe temporarily
raych1 Mar 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
/tools/perf-automation/ @mikeharder @benbp
/tools/pipeline-generator/ @weshaggard @benbp
/tools/pipeline-witness/ @praveenkuttappan @weshaggard
/tools/sdk-ai-bots/ @raych1 @lirenhe
/tools/sdk-generation-pipeline/ @weshaggard @praveenkuttappan @maririos
/tools/sdk-testgen/ @raych1 @tadelesh
/tools/test-proxy/ @scbedd @mikeharder
Expand Down
105 changes: 105 additions & 0 deletions tools/sdk-ai-bots/.pipelines/build-document-embeddings.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Pipeline that builds document embeddings in three stages:
# EngHub documents, TypeSpec documents, and the customized document list.
# CI and PR triggers are disabled, so runs come from the schedule below or a manual queue.
trigger: none
pr: none

# Daily scheduled run against the main branch.
# NOTE(review): Azure Pipelines evaluates cron schedules in UTC, and `always: true`
# requests a run even when there are no new commits — confirm against the schedules docs.
schedules:
- cron: "0 0 * * *"
displayName: Daily midnight build
branches:
include:
- main
always: true

# Agent image used by the jobs below.
pool:
vmImage: 'windows-latest'

# Short aliases for variables that are not defined in this file
# (presumably set in the pipeline settings or a variable group — TODO confirm).
variables:
st-account-name: $(storage-account-name)
st-container-name: $(storage-account-container)
aoai-endpoint: $(azure-openai-endpoint)
asch-endpoint: $(azure-search-endpoint)
asch-index-name: $(azure-search-index-name)
aoai-embedding-model: $(azure-openai-embedding-model)

# Run-time option forwarded to every build script as -IncrementalEmbedding.
# Defaults to true (incremental refresh); unselect it to rebuild embeddings from scratch.
parameters:
- name: incrementalEmbedding
displayName: 'Incremental Embedding Build?'
type: boolean
default: true

stages:
# Stage 1: embeddings for the engineering hub documents.
- stage: BuildEngHubDocEmbeddings
displayName: 'Build EngHub Document Embeddings'
jobs:
- job: BuildEngHubDocumentEmbeddings
steps:
# setup-pipeline.yml selects the Python version and checks out this repository.
- template: setup-pipeline.yml
# Second checkout: the engineering hub docs repository from the 'internal' project.
- checkout: git://internal/_git/azure-sdk-docs-eng.ms
displayName: 'Checkout azure-sdk-docs-eng.ms repository'
# Two repositories are checked out in this job, so the script path includes the
# azure-sdk-tools/ folder under $(Build.SourcesDirectory).
- task: Powershell@2
inputs:
filePath: $(Build.SourcesDirectory)/azure-sdk-tools/tools/sdk-ai-bots/Scripts/Build-EngHubDocEmbeddings.ps1
arguments: >
-IncrementalEmbedding "${{ parameters.incrementalEmbedding }}"
pwsh: true
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run embeddings build script'
# Service endpoints, keys and storage settings are handed to the script as environment variables.
# The three *-key variables are referenced directly rather than through the aliases above
# (presumably secret variables, which need an explicit env mapping — TODO confirm).
env:
AZURE_OPENAI_ENDPOINT: $(aoai-endpoint)
AZURE_SEARCH_ENDPOINT: $(asch-endpoint)
AZURE_SEARCH_INDEX_NAME: $(asch-index-name)
AZURE_OPENAI_EMBEDDING_MODEL: $(aoai-embedding-model)
AZURE_OPENAI_API_KEY: $(azure-openapi-key)
AZURE_SEARCH_KEY: $(azure-search-key)
AZURE_STORAGE_ACCOUNT_KEY: $(storage-account-key)
AZURE_STORAGE_ACCOUNT_NAME: $(st-account-name)
AZURE_STORAGE_ACCOUNT_CONTAINER: $(st-container-name)

# Stage 2: embeddings for the TypeSpec documents.
- stage: BuildTypeSpecDocEmbeddings
displayName: 'Build TypeSpec Document Embeddings'
jobs:
- job: BuildTypeSpecDocumentEmbeddings
steps:
# Only this repository is checked out here, so the script path has no repository sub-folder.
- template: setup-pipeline.yml
- task: Powershell@2
inputs:
filePath: $(Build.SourcesDirectory)/tools/sdk-ai-bots/Scripts/Build-TypeSpecAzureDocEmbeddings.ps1
arguments: >
-IncrementalEmbedding "${{ parameters.incrementalEmbedding }}"
pwsh: true
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run embeddings build script'
# Same environment mapping as the EngHub stage.
env:
AZURE_OPENAI_ENDPOINT: $(aoai-endpoint)
AZURE_SEARCH_ENDPOINT: $(asch-endpoint)
AZURE_SEARCH_INDEX_NAME: $(asch-index-name)
AZURE_OPENAI_EMBEDDING_MODEL: $(aoai-embedding-model)
AZURE_OPENAI_API_KEY: $(azure-openapi-key)
AZURE_SEARCH_KEY: $(azure-search-key)
AZURE_STORAGE_ACCOUNT_KEY: $(storage-account-key)
AZURE_STORAGE_ACCOUNT_NAME: $(st-account-name)
AZURE_STORAGE_ACCOUNT_CONTAINER: $(st-container-name)

# Stage 3: embeddings for the customized document list.
- stage: BuildCustomizedDocEmbeddings
displayName: 'Build Customized Document Embeddings'
jobs:
- job: BuildCustomizedDocumentEmbeddings
steps:
# Only this repository is checked out here, so the script path has no repository sub-folder.
- template: setup-pipeline.yml
- task: Powershell@2
inputs:
filePath: $(Build.SourcesDirectory)/tools/sdk-ai-bots/Scripts/Build-CustomizedDocEmbeddings.ps1
arguments: >
-IncrementalEmbedding "${{ parameters.incrementalEmbedding }}"
pwsh: true
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run embeddings build script'
# Same environment mapping as the EngHub stage.
env:
AZURE_OPENAI_ENDPOINT: $(aoai-endpoint)
AZURE_SEARCH_ENDPOINT: $(asch-endpoint)
AZURE_SEARCH_INDEX_NAME: $(asch-index-name)
AZURE_OPENAI_EMBEDDING_MODEL: $(aoai-embedding-model)
AZURE_OPENAI_API_KEY: $(azure-openapi-key)
AZURE_SEARCH_KEY: $(azure-search-key)
AZURE_STORAGE_ACCOUNT_KEY: $(storage-account-key)
AZURE_STORAGE_ACCOUNT_NAME: $(st-account-name)
AZURE_STORAGE_ACCOUNT_CONTAINER: $(st-container-name)
11 changes: 11 additions & 0 deletions tools/sdk-ai-bots/.pipelines/setup-pipeline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Shared step template: selects a Python version and checks out the pipeline's own repository.
parameters:
# Version spec handed to UsePythonVersion; defaults to '3.x'.
- name: pythonVersion
type: string
default: '3.x'

steps:
# Make the requested Python version available on PATH for the steps that follow.
- task: UsePythonVersion@0
inputs:
versionSpec: ${{ parameters.pythonVersion }}
addToPath: true
# Check out the repository that contains the pipeline definition.
- checkout: self
4 changes: 2 additions & 2 deletions tools/sdk-ai-bots/AzureSdkQaBot/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Azure SDK Assistant
# Azure SDK Teams Assistant

## Summary

This is a conversational bot for Microsoft Teams that answers the question related to Azure rest api spec repo document and pull request review.
This is a conversational bot for Microsoft Teams that answers questions related to the Azure domain area.
23 changes: 21 additions & 2 deletions tools/sdk-ai-bots/Embeddings/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,23 @@
## Get Started
## How to Run This Tool
1. Run `pip install -r requirements.txt` to install requirements.
1. Set the environment variables that are defined in `settings/settings.py` in a `.env` file.
1. Run `python main.py`.
1. Run `python main.py`.

## Environment Variables
INCREMENTAL_EMBEDDING: the option to build embedding incrementally.
METADATA_PATH: the file path of the metadata file which contains the document URL and title.
DOCUMENT_PATH: the folder path of the documents for which embeddings need to be built.
RAG_CHUNK_PATH: the file path of the most recent RAG chunk file, or just the file name if no previous version exists.

AZURE_OPENAI_API_KEY: Azure OpenAI api key
AZURE_OPENAI_ENDPOINT: Azure OpenAI endpoint
AZURE_SEARCH_KEY: Azure search service key
AZURE_SEARCH_ENDPOINT: Azure search service endpoint
AZURE_SEARCH_INDEX_NAME: Azure search service index name
AZURE_OPENAI_EMBEDDING_MODEL: the deployed model name in Azure OpenAI service

##### DO NOT CHANGE BELOW VARIABLES' VALUE
AZURESEARCH_FIELDS_CONTENT=Text
AZURESEARCH_FIELDS_CONTENT_VECTOR=Embedding
AZURESEARCH_FIELDS_TAG=AdditionalMetadata
AZURESEARCH_FIELDS_ID=Id
4 changes: 0 additions & 4 deletions tools/sdk-ai-bots/Embeddings/embedding/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,6 @@

class Embedding:
def __init__(self):
os.environ["AZURESEARCH_FIELDS_CONTENT"] = "Text"
os.environ["AZURESEARCH_FIELDS_CONTENT_VECTOR"] = "Embedding"
os.environ["AZURESEARCH_FIELDS_TAG"] = "AdditionalMetadata"
os.environ["AZURESEARCH_FIELDS_ID"] = "Id"

embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
openai_api_type="azure",
Expand Down
37 changes: 36 additions & 1 deletion tools/sdk-ai-bots/README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,37 @@
# sdk-ai-bots
## Overview
This folder contains a collection of tools that utilize AI techniques.

#### AzureSdkQaBot
AzureSdkQaBot is a Teams bot which can answer the questions related to the Azure SDK domain. It is written in C#.

#### Embeddings
It is a tool written in Python that uses `langchain` library to create embeddings in Azure Search Service.

#### Scripts
This folder contains some scripts to build embeddings by calling the `Embeddings` tool.


## How to Refresh the Document Embeddings Used by Teams Bot
We have an [Azure DevOps pipeline](https://dev.azure.com/azure-sdk/internal/_build?definitionId=6811&_a=summary) which can help create or refresh the embeddings.

1. This pipeline contains three stages:
- Build EngHub Document Embeddings
This stage builds embeddings for all the documents under the [engineering hub site](https://dev.azure.com/azure-sdk/internal/_git/azure-sdk-docs-eng.ms?path=/docs)
- Build TypeSpec Document Embeddings
This stage builds embeddings for all the documents under the [typespec-azure site](https://github.com/Azure/typespec-azure)
- Build Customized Document Embeddings
This stage builds embeddings for some markdown documents which are publicly accessible.

2. The user can select specific stages when running the pipeline. By default, all three stages are included.

3. The pipeline has an option to refresh the embeddings incrementally. By default, `Incremental Embedding Build` is selected when the pipeline is triggered. If the user wants to create embeddings from scratch, they should unselect this option when triggering the pipeline.

### How to Add a New Document to the Customized Document List
If you have a publicly accessible markdown file that you want the Teams bot to understand, you can add the information to [this file](https://github.com/Azure/azure-sdk-tools/blob/main/tools/sdk-ai-bots/Embeddings/settings/metadata_customized_docs.json) in the following format.
```JSON
"ci-fix.md": {
"title": "CI Fix Guide",
"url": "https://github.com/Azure/azure-rest-api-specs/blob/main/documentation/ci-fix.md"
}
```
This file is a `JSON`, and you must ensure that the `key` in this `JSON` is not duplicated when adding a new document.
33 changes: 22 additions & 11 deletions tools/sdk-ai-bots/Scripts/Build-CustomizedDocEmbeddings.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -52,28 +52,30 @@ function Download-GitHubFile {
}

try {
Invoke-WebRequest -Uri $url -OutFile $DestinationFilePath
Invoke-WebRequest -Uri $FileUrl -OutFile $DestinationFilePath
Write-Host "File downloaded successfully to: $DestinationFilePath"
}
catch {
Write-Error "Failed to download file from GitHub: $url"
Write-Error "Failed to download file from GitHub: $FileUrl"
exit 1
}
}


$workingDirectory = Get-Location
if($env:AGENT_ID) {
$workingDirectory = $(System.DefaultWorkingDirectory)
}
$workingDirectory = Join-Path $workingDirectory "tools\sdk-ai-bots"
# Set the working directory, current location is supposed to be the root of the repository
$buildSourceDirectory = Get-Location
$workingDirectory = Join-Path $buildSourceDirectory "tools\sdk-ai-bots"
$scriptsRoot = Join-Path $workingDirectory "Scripts"
$embeddingToolFolder = Join-Path $workingDirectory "Embeddings"

. (Join-Path $scriptsRoot common.ps1)

Write-Host "scriptsRoot: $scriptsRoot"
Write-Host "embeddingToolFolder: $embeddingToolFolder"
. (Join-Path $scriptsRoot Common.ps1)

# Install Az.Storage module
if (-not (Get-Module -ListAvailable -Name Az.Storage)) {
Install-Module -Name Az.Storage -Force -AllowClobber -Scope CurrentUser
}

# Create embeddingSource folder on current location
$embeddingSourceFolder = Join-Path -Path $workingDirectory -ChildPath "embeddingSource"
Expand Down Expand Up @@ -106,11 +108,15 @@ else {
}

# Download previous saved embeddings(last_rag_chunks_customized_docs.json) from Azure Blob Storage
$storageAccountName = "saazuresdkbot"
$containerName = "rag-contents"
$blobName = "last_rag_chunks_customized_docs.json"
$destinationPath = $embeddingSourceFolder
$ragChunkPath = Join-Path -Path $embeddingSourceFolder -ChildPath $blobName
$storageAccountName = $env:AZURE_STORAGE_ACCOUNT_NAME
$containerName = $env:AZURE_STORAGE_ACCOUNT_CONTAINER
if(-not $containerName) {
Write-Error "Please set the environment variable 'AZURE_STORAGE_ACCOUNT_CONTAINER'."
exit 1
}
if($IncrementalEmbedding -eq $true) {
Write-Host "Downloading previous saved embeddings $blobName from Azure Blob Storage"
if(-not (Download-AzureBlob -StorageAccountName $storageAccountName -ContainerName $containerName -BlobName $blobName -DestinationPath $destinationPath)) {
Expand All @@ -124,6 +130,11 @@ $env:RAG_CHUNK_PATH = $ragChunkPath
$env:METADATA_PATH = $customizedDocsMetadataFile
$env:DOCUMENT_PATH = $customizedDocsDestFolder
$env:INCREMENTAL_EMBEDDING = $IncrementalEmbedding
$env:AZURESEARCH_FIELDS_CONTENT = "Text"
$env:AZURESEARCH_FIELDS_CONTENT_VECTOR = "Embedding"
$env:AZURESEARCH_FIELDS_TAG = "AdditionalMetadata"
$env:AZURESEARCH_FIELDS_ID = "Id"

if(-not (Build-Embeddings -EmbeddingToolFolder $embeddingToolFolder)) {
exit 1
}
Expand Down
53 changes: 34 additions & 19 deletions tools/sdk-ai-bots/Scripts/Build-EngHubDocEmbeddings.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -17,30 +17,23 @@ param (
[ValidateNotNullOrEmpty()]
[string] $IncrementalEmbedding = $true
)

$workingDirectory = Get-Location
# Set the working directory, current location is supposed to be the root of the repository
$buildSourceDirectory = Get-Location
$workingDirectory = Join-Path $buildSourceDirectory "tools\sdk-ai-bots"
if($env:AGENT_ID) {
$workingDirectory = $(System.DefaultWorkingDirectory)
# Running in Azure DevOps: the pipeline checks out two repositories (azure-sdk-tools and the engineering hub repository), so the working directory must point inside the azure-sdk-tools folder.
$workingDirectory = Join-Path $buildSourceDirectory "azure-sdk-tools\tools\sdk-ai-bots"
}
$workingDirectory = Join-Path $workingDirectory "tools\sdk-ai-bots"
$scriptsRoot = Join-Path $workingDirectory "Scripts"
$embeddingToolFolder = Join-Path $workingDirectory "Embeddings"

. (Join-Path $scriptsRoot common.ps1)

Write-Host "scriptsRoot: $scriptsRoot"
Write-Host "embeddingToolFolder: $embeddingToolFolder"
. (Join-Path $scriptsRoot Common.ps1)

# Create 'repos' folder on current location
$reposFolder = Join-Path -Path $workingDirectory -ChildPath "repos"
if (-not (Test-Path -Path $reposFolder)) {
New-Item -ItemType Directory -Path $reposFolder
}

# Clone azure-sdk-docs-eng.ms repository
Write-Host "Cloning azure-sdk-docs-eng.ms repository at $reposFolder"
if(-not (Clone-Repository -RepoUrl "https://[email protected]/azure-sdk/internal/_git/azure-sdk-docs-eng.ms" -RootFolder $reposFolder)) {
exit 1
# Install Az.Storage module
if (-not (Get-Module -ListAvailable -Name Az.Storage)) {
Install-Module -Name Az.Storage -Force -AllowClobber -Scope CurrentUser
}

# Create embeddingSource folder on current location
Expand All @@ -55,7 +48,19 @@ if (-not (Test-Path -Path $enghubDocsDestFolder)) {
New-Item -ItemType Directory -Path $enghubDocsDestFolder
}

$enghubDocsSrcFolder = Join-Path -Path $reposFolder -ChildPath "azure-sdk-docs-eng.ms/docs"
$reposFolder = Join-Path -Path $buildSourceDirectory -ChildPath "azure-sdk-docs-eng.ms"
if(-not (Test-Path $reposFolder)) {
# Clone eng hub repository
Write-Host "Cloning azure-sdk-docs-eng.ms repository at $buildSourceDirectory"
if(-not (Clone-Repository -RepoUrl "https://[email protected]/azure-sdk/internal/_git/azure-sdk-docs-eng.ms" -RootFolder $buildSourceDirectory)) {
exit 1
}
}
$enghubDocsSrcFolder = Join-Path -Path $buildSourceDirectory -ChildPath "azure-sdk-docs-eng.ms/docs"
if(-not (Test-Path $enghubDocsSrcFolder)) {
Write-Error "Failed to find the enghub documents folder at $enghubDocsSrcFolder"
exit 1
}

# Call the script to build the metadata.json file
Write-Host "Building metadata.json file for enghub documents"
Expand All @@ -71,11 +76,15 @@ else {
}

# Download previous saved embeddings(last_rag_chunks_enghub_docs.json) from Azure Blob Storage
$storageAccountName = "saazuresdkbot"
$containerName = "rag-contents"
$blobName = "last_rag_chunks_enghub_docs.json"
$destinationPath = $embeddingSourceFolder
$ragChunkPath = Join-Path -Path $embeddingSourceFolder -ChildPath $blobName
$storageAccountName = $env:AZURE_STORAGE_ACCOUNT_NAME
$containerName = $env:AZURE_STORAGE_ACCOUNT_CONTAINER
if(-not $containerName) {
Write-Error "Please set the environment variable 'AZURE_STORAGE_ACCOUNT_CONTAINER'."
exit 1
}
if($IncrementalEmbedding -eq $true) {
Write-Host "Downloading previous saved embeddings $blobName from Azure Blob Storage"
if(-not (Download-AzureBlob -StorageAccountName $storageAccountName -ContainerName $containerName -BlobName $blobName -DestinationPath $destinationPath)) {
Expand All @@ -89,6 +98,12 @@ $env:RAG_CHUNK_PATH = $ragChunkPath
$env:METADATA_PATH = "$embeddingSourceFolder/metadata_enghub_docs.json"
$env:DOCUMENT_PATH = $enghubDocsDestFolder
$env:INCREMENTAL_EMBEDDING = $IncrementalEmbedding

$env:AZURESEARCH_FIELDS_CONTENT = "Text"
$env:AZURESEARCH_FIELDS_CONTENT_VECTOR = "Embedding"
$env:AZURESEARCH_FIELDS_TAG = "AdditionalMetadata"
$env:AZURESEARCH_FIELDS_ID = "Id"

if(-not (Build-Embeddings -EmbeddingToolFolder $embeddingToolFolder)) {
exit 1
}
Expand Down
Loading