-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathutils.py
More file actions
122 lines (105 loc) · 3.93 KB
/
utils.py
File metadata and controls
122 lines (105 loc) · 3.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import requests
import polars as pl
def fetch_all_gitlab_issues(
    project_path: str,
    gitlab_base_url: str = "https://gitlab.opencode.de",
    per_page: int = 100,
    timeout: int = 30,
) -> list:
    """
    Fetch all issues from a public GitLab project.

    Pages through ``/api/v4/projects/<project>/issues`` with ``state=all``,
    ordered by ``created_at`` ascending, and stops at the first empty page.
    Only an ``Accept: application/json`` header is sent (no credentials).

    project_path: "namespace/project", e.g. "dstack/d-stack-home"
    gitlab_base_url: root URL of the GitLab instance; a trailing "/" is ignored
    per_page: value sent as the ``per_page`` query parameter
    timeout: timeout handed to every ``requests`` call
    returns: list of issue JSON objects
    raises: requests.HTTPError if any page request returns an error status
    """
    # Function-scope import so the block does not depend on file-level imports.
    from urllib.parse import quote

    # The project path is a single URL path segment, so "/" (and any other
    # reserved character) has to be percent-encoded.
    project_path_enc = quote(project_path, safe="")
    base_api = (
        f"{gitlab_base_url.rstrip('/')}/api/v4/projects/{project_path_enc}/issues"
    )

    issues = []
    page = 1
    # The context manager releases the session's pooled connections even when
    # a request raises.
    with requests.Session() as session:
        session.headers.update({"Accept": "application/json"})
        while True:
            resp = session.get(
                base_api,
                params={
                    "state": "all",
                    "per_page": per_page,
                    "page": page,
                    "order_by": "created_at",
                    "sort": "asc",
                },
                timeout=timeout,
            )
            resp.raise_for_status()
            batch = resp.json()
            # An empty page means we have walked past the last issue.
            if not batch:
                break
            issues.extend(batch)
            page += 1
    return issues
def clean_issues_df(df: pl.DataFrame, columns_to_use: list, rows_to_exclude: list) -> pl.DataFrame:
    """
    Flatten, parse and de-duplicate the raw issues DataFrame.

    - Extract "id", "name" and "state" from the "author" struct column into
      author_id / author_name / author_state
    - Parse created_at / updated_at (and closed_at, when the column exists)
      from strings into datetimes
    - Drop rows whose "iid" is listed in rows_to_exclude
    - Drop duplicate (title, description) pairs, keeping the first occurrence
      and preserving the incoming row order
    - Return only the columns named in columns_to_use
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S%.fZ"

    df = df.with_columns([
        pl.col("author").struct.field("id").alias("author_id"),
        pl.col("author").struct.field("name").alias("author_name"),
        pl.col("author").struct.field("state").alias("author_state"),
    ])

    # Parse the datetime columns that are expected on every row.
    df = df.with_columns([
        pl.col("created_at").str.strptime(pl.Datetime, format=timestamp_format),
        pl.col("updated_at").str.strptime(pl.Datetime, format=timestamp_format),
    ])

    # closed_at is null for open issues: cast to string first and parse
    # non-strictly so values that do not match the format become null.
    if "closed_at" in df.columns:
        df = df.with_columns([
            pl.col("closed_at")
            .cast(pl.String)
            .str.strptime(pl.Datetime, format=timestamp_format, strict=False)
        ])

    df = df.filter(~pl.col("iid").is_in(rows_to_exclude))

    # keep="first" + maintain_order=True makes the de-duplication reproducible:
    # the default (keep="any", unordered) may return a different surviving row
    # and a different row order from run to run.
    df = df.unique(subset=["title", "description"], keep="first", maintain_order=True)

    return df.select(columns_to_use)
def prepare_issues_df(df: pl.DataFrame, desc_to_exclude: list) -> pl.DataFrame:
    """
    Derive helper columns from title/description and drop unwanted rows.

    Added columns:
    - desc_clean: "description" with every literal "**Feedback:** <br>" removed
    - is_from_form: whether "title" starts with "Feedback für die Seite"
    - form_page: for form titles, the title with that leading prefix removed
      and surrounding whitespace stripped; "Via OpenCode" otherwise

    Rows whose desc_clean is contained in desc_to_exclude are filtered out.
    """
    title = pl.col("title")
    description = pl.col("description")

    # Shared by the is_from_form flag and the form_page branch below.
    title_is_form_feedback = title.str.starts_with("Feedback für die Seite")
    page_from_title = (
        title.str.replace("^Feedback für die Seite", "").str.strip_chars()
    )

    # Step 1: cleaned description.
    with_clean_desc = df.with_columns(
        desc_clean=description.str.replace_all("**Feedback:** <br>", "", literal=True)
    )

    # Step 2: where the issue came from.
    with_origin = with_clean_desc.with_columns(
        is_from_form=title_is_form_feedback,
        form_page=(
            pl.when(title_is_form_feedback)
            .then(page_from_title)
            .otherwise(pl.lit("Via OpenCode"))
        ),
    )

    # Step 3: drop rows with an excluded description.
    is_excluded = pl.col("desc_clean").is_in(desc_to_exclude)
    return with_origin.filter(~is_excluded)
def postprocess_issues(df: pl.DataFrame, label_version: int) -> pl.DataFrame:
    """
    Postprocess the labeled issues DataFrame.

    Steps, applied in this order:
    - Replace an empty list in labels_v<label_version> with ["Unklar"]
    - Remove rows whose form_page is one of a fixed set of excluded pages
    - Normalize form_page: "/" becomes "home", any other value loses a
      trailing "/"
    """
    labels_name = f"labels_v{label_version}"
    labels = pl.col(labels_name)
    page = pl.col("form_page")

    # Empty label lists get the "Unklar" fallback label.
    labels_with_fallback = (
        pl.when(labels.list.len() == 0)
        .then(pl.lit(["Unklar"]))
        .otherwise(labels)
        .alias(labels_name)
    )

    # Pages that are excluded from the result.
    exclude_pages = [
        '/beteiligung?utm_source=chatgpt.com',
        '/wtf',
        '/landkarte/ Tech-Stack Aufnahmekriterien & Prozess',
    ]
    is_excluded_page = page.is_in(exclude_pages)

    # The root page "/" is renamed instead of being stripped to an empty string.
    normalized_page = (
        pl.when(page == "/")
        .then(pl.lit("home"))
        .otherwise(page.str.strip_suffix("/"))
        .alias("form_page")
    )

    return (
        df.with_columns(labels_with_fallback)
        .filter(~is_excluded_page)
        .with_columns(normalized_page)
    )