diff --git a/README.md b/README.md
index 88a3fb78..91be5a16 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,79 @@
+# Another Sage Attention Windows Fork (Sage Attention 3)
+
+This is **another Sage Attention Windows fork**, adding support for **Sage Attention 3**.
+
+It is based on the official Sage Attention repository, with **minimal changes**: the goal is only to fix Windows-specific build issues and to provide Windows builds with **feature parity** with upstream Sage Attention.
+
+---
+
+## What This Fork Provides
+
+- Based on **official Sage Attention** with the smallest possible patch set.
+- Changes are limited to:
+  - Windows build fixes
+  - Packaging / wheel generation
+- No new features are added on top of upstream; the functionality should be **identical** to the official project, just **buildable and usable on Windows**.
+
+Due to limitations of my local environment, I can currently only provide wheels for:
+
+- **PyTorch:** 2.6.0 – 2.9.1
+- **Python:** 3.11 – 3.13
+
+---
+
+## CUDA Architectures
+
+CUDA architectures are kept **aligned with the official Windows PyTorch builds**:
+
+- During the build, the script calls:
+
+  ```python
+  torch.cuda.get_arch_list()
+  ```
+
+- The result is used to populate the `TORCH_CUDA_ARCH_LIST` environment variable.
+- This keeps Sage Attention's supported architectures **as close as possible** to those of the installed PyTorch wheel.
+
+However, the **upstream constraints still apply** and are **not** bypassed in this fork:
+
+- **Sage 2++** does **not** support architectures **below 8.0**.
+- **Sage 3** does **not** support architectures **below 10.0**.
+- This fork does **not** extend or change those architecture limits.
+
+---
+
+## Why Does This Fork Exist?
+
+Sage Attention is a great project: it uses **quantized attention** to achieve significant speedups while keeping the CUDA kernels relatively straightforward (especially compared to something like FlashAttention).
+For me, it's a **must-have attention implementation for ComfyUI**.
+
+Unfortunately, I am a **Windows user**, and Sage Attention's Windows support is, as many people know, quite poor.
+
+Thankfully, there are some excellent forks out there, especially **woct0rdho**'s Windows fork. That fork:
+
+- Ships Windows wheels
+- Fixes multiple platform-specific issues
+- In my experience, is sometimes even **more stable** than upstream
+
+I have been using their wheels for a long time and absolutely love that branch.
+
+Then **Sage Attention 3** arrived, and that's where the problems started.
+
+woct0rdho builds **Python abi3** wheels to avoid version hell, which is a brilliant idea. However, Sage Attention 3 seems to rely on some new pieces that make it impossible (or at least very difficult) to keep using abi3 when building the wheels.
+
+That was pretty frustrating.
+
+Fortunately, after some not-too-complicated fixes, I managed to get Sage Attention 3 building locally on Windows. Huge thanks to:
+
+- woct0rdho's fork, and
+- the relevant PRs from **pamparamm**,
+
+both of which were very helpful.
+
+Given the current level of Windows support in the main Sage Attention project, I felt it would be a shame to keep this to myself, so I decided to publish this fork and share the Windows builds.
+
+The following is the original README.
+
+---
+
 # SageAttention
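
---

As a side note on the "CUDA Architectures" step in the README added above: the conversion from `torch.cuda.get_arch_list()` to `TORCH_CUDA_ARCH_LIST` might look roughly like the sketch below. This is a hypothetical illustration, not the fork's actual build script; the helper name `arch_list_from_torch` and the example input list are made up for demonstration.

```python
# Hypothetical sketch (NOT the fork's actual build script) of deriving
# TORCH_CUDA_ARCH_LIST from the arch names torch.cuda.get_arch_list() returns.
import os
import re


def arch_list_from_torch(arch_names):
    """Convert entries like 'sm_80' or 'compute_90' to '8.0'-style versions."""
    versions = set()
    for name in arch_names:
        m = re.fullmatch(r"(?:sm|compute)_(\d+)a?", name)
        if m:
            digits = m.group(1)
            # 'sm_80' -> '8.0', 'sm_120' -> '12.0'
            versions.add(f"{digits[:-1]}.{digits[-1]}")
    return ";".join(sorted(versions, key=float))


# Example input is illustrative, not a claim about any specific PyTorch wheel.
archs = ["sm_80", "sm_86", "sm_90", "sm_120"]
os.environ["TORCH_CUDA_ARCH_LIST"] = arch_list_from_torch(archs)
print(os.environ["TORCH_CUDA_ARCH_LIST"])  # 8.0;8.6;9.0;12.0
```

Setting the variable this way means the extension builder (e.g. PyTorch's CUDA extension tooling, which reads `TORCH_CUDA_ARCH_LIST`) compiles kernels only for the architectures the installed PyTorch wheel itself targets.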