
Commit 270e4a7

update
1 parent cfc12a6 commit 270e4a7

2 files changed: +76 -54 lines changed

Jenkinsfile

Lines changed: 75 additions & 53 deletions
@@ -35,7 +35,7 @@ spec:
     SONAR_SERVER = "sonarqube"
     GCP_SA_CRED = "gcp-sa"

-    // Optional override (leave empty) — if you know a working path, set it to one of:
+    // Optional override (leave empty). Accepts:
     // - gs://hadoop-lib/hadoop-streaming/hadoop-streaming.jar
     // - gs://<your-bucket>/lib/hadoop-streaming-3.3.6.jar
     // - file:///usr/lib/hadoop-mapreduce/hadoop-streaming.jar
@@ -112,28 +112,23 @@ spec:
     echo "== Describe Dataproc cluster ==" && gcloud dataproc clusters describe "${CLUSTER_NAME}" --region "${REGION}" >/dev/null
     echo "== Probe GCS bucket ==" && gsutil ls "gs://${BUCKET}/" || true

-    # Helper: robust downloader (curl -> wget -> python)
+    # Helper downloader (curl -> wget -> python3)
     dl() {
       local url="$1" out="$2"
-      if command -v curl >/dev/null 2>&1; then
-        curl -fSL "$url" -o "$out" && return 0
-      fi
-      if command -v wget >/dev/null 2>&1; then
-        wget -O "$out" "$url" && return 0
-      fi
+      if command -v curl >/dev/null 2>&1; then curl -fSL "$url" -o "$out" && return 0; fi
+      if command -v wget >/dev/null 2>&1; then wget -O "$out" "$url" && return 0; fi
       if command -v python3 >/dev/null 2>&1; then
         python3 - "$url" "$out" << 'PY'
 import sys, urllib.request
-u,o=sys.argv[1],sys.argv[2]
-urllib.request.urlretrieve(u,o)
+u,o=sys.argv[1],sys.argv[2]; urllib.request.urlretrieve(u,o)
 PY
         return 0
       fi
       echo "No downloader available (curl/wget/python3)"; return 1
     }

     # Resolve streaming jar
-    HSJ="${HADOOP_STREAMING_JAR:-}"  # safe default avoids 'unbound variable'
+    HSJ="${HADOOP_STREAMING_JAR:-}"
     RESOLVED_JAR=""

     # 1) Use provided env if valid
@@ -146,13 +141,12 @@ PY
           echo "Provided HADOOP_STREAMING_JAR not found: $HSJ"
         fi
       else
-        # allow file:/// (cannot preflight)
-        RESOLVED_JAR="$HSJ"
+        RESOLVED_JAR="$HSJ"  # allow file:///
         echo "Using provided non-GCS jar path: $RESOLVED_JAR"
       fi
     fi

-    # 2) Try public GCS locations
+    # 2) Try public GCS
     if [[ -z "$RESOLVED_JAR" ]]; then
       for C in \
         "gs://hadoop-lib/hadoop-streaming/hadoop-streaming.jar" \
@@ -166,14 +160,13 @@ PY
       done
     fi

-    # 3) Fallback to cluster local path (will still stage a known-good jar next)
+    # 3) Fallback to cluster local (then stage a known-good jar)
     if [[ -z "$RESOLVED_JAR" ]]; then
       RESOLVED_JAR="file:///usr/lib/hadoop-mapreduce/hadoop-streaming.jar"
       echo "Fallback to cluster-local path: $RESOLVED_JAR"
     fi

     # 4) Stage known-good jar to your bucket and switch to it
-    #    (ensures success even if cluster-local path doesn't exist)
     if [[ "$RESOLVED_JAR" == file://* ]]; then
       HVER="3.3.6"
       LOCAL="hadoop-streaming-${HVER}.jar"
@@ -192,7 +185,6 @@ PY
       echo "Resolved jar (staged): $RESOLVED_JAR"
     fi

-    # Persist for next stage
     echo "export HADOOP_STREAMING_RESOLVED_JAR=\"$RESOLVED_JAR\"" > .resolved_jar.env
     echo "Preflight OK. Using streaming jar: $RESOLVED_JAR"
     '''
@@ -201,7 +193,7 @@ PY
       }
     }

-    stage('Prep inputs (upload .py to GCS)') {
+    stage('Stage code (mapper/reducer) & data to GCS') {
       steps {
         container('cloud-sdk') {
           withCredentials([file(credentialsId: env.GCP_SA_CRED, variable: 'GOOGLE_APPLICATION_CREDENTIALS')]) {
@@ -212,18 +204,63 @@ PY
     fi
     gcloud config set project "${PROJECT_ID}"

-    INPUT_PATH="gs://${BUCKET}/inputs/${JOB_NAME}/${BUILD_NUMBER}"
+    JOB_ROOT="gs://${BUCKET}/${JOB_NAME}/${BUILD_NUMBER}"
+    CODE_PREFIX="${JOB_ROOT}/code"
+    DATA_PREFIX="${JOB_ROOT}/data"
+
+    # discover mapper / reducer within repo
+    MAP="${MAP:-}"
+    RED="${RED:-}"
+    if [[ -z "$MAP" ]]; then
+      if [[ -f mapper.py ]]; then MAP=mapper.py; else MAP="$(git ls-files | grep -E '^mapper\\.py$|/?mapper\\.py$' | head -n1)"; fi
+    fi
+    if [[ -z "$RED" ]]; then
+      if [[ -f reducer.py ]]; then RED=reducer.py; else RED="$(git ls-files | grep -E '^reducer\\.py$|/?reducer\\.py$' | head -n1)"; fi
+    fi
+    [[ -n "$MAP" && -n "$RED" ]] || { echo "mapper.py/reducer.py not found in repo"; exit 1; }
+    echo "Mapper: $MAP"
+    echo "Reducer: $RED"
+
+    # clean and upload ONLY mapper & reducer under code/
+    gsutil -m rm -r "${CODE_PREFIX}" >/dev/null 2>&1 || true
+    gsutil -m cp "$MAP" "${CODE_PREFIX}/"
+    gsutil -m cp "$RED" "${CODE_PREFIX}/"

-    gsutil -m rm -r "${INPUT_PATH}" >/dev/null 2>&1 || true
+    # pick data files from repo (flat) – .txt/.csv/.log by default
+    gsutil -m rm -r "${DATA_PREFIX}" >/dev/null 2>&1 || true
+    mkdir -p /tmp/upload_data

-    mkdir -p /tmp/upload_py
+    found=0
     while IFS= read -r f; do
-      mkdir -p "/tmp/upload_py/$(dirname "$f")"
-      cp "$f" "/tmp/upload_py/$f"
-    done < <(git ls-files '*.py')
+      cp "$f" "/tmp/upload_data/$(basename "$f")"
+      found=1
+    done < <(git ls-files | grep -Ei '\\.(txt|csv|log)$' || true)
+
+    # if no data files in repo, create a tiny sample
+    if [[ "$found" -eq 0 ]]; then
+      echo "No data files found (*.txt, *.csv, *.log). Creating sample..."
+      cat > /tmp/upload_data/sample.txt <<EOF
+alpha
+beta
+gamma
+alpha
+beta
+alpha
+EOF
+    fi
+
+    gsutil -m cp /tmp/upload_data/* "${DATA_PREFIX}/"

-    (cd /tmp/upload_py && gsutil -m cp -r . "${INPUT_PATH}/")
-    echo "Uploaded inputs to ${INPUT_PATH}"
+    # persist paths for submit stage
+    {
+      echo "export CODE_PREFIX='${CODE_PREFIX}'"
+      echo "export DATA_PREFIX='${DATA_PREFIX}'"
+      echo "export MAP_BASENAME='$(basename "$MAP")'"
+      echo "export RED_BASENAME='$(basename "$RED")'"
+    } >> .resolved_jar.env
+
+    echo "Staged code -> ${CODE_PREFIX}"
+    echo "Staged data -> ${DATA_PREFIX}"
     '''
           }
         }
@@ -242,43 +279,28 @@ PY
     gcloud config set project "${PROJECT_ID}"
     gcloud config set dataproc/region "${REGION}"

-    # load resolved jar
+    # load resolved vars
     source .resolved_jar.env
-    echo "Submitting with streaming JAR: ${HADOOP_STREAMING_RESOLVED_JAR}"
+    echo "Streaming JAR: ${HADOOP_STREAMING_RESOLVED_JAR}"
+    echo "CODE_PREFIX : ${CODE_PREFIX}"
+    echo "DATA_PREFIX : ${DATA_PREFIX}"

-    INPUT_PREFIX="gs://${BUCKET}/inputs/${JOB_NAME}/${BUILD_NUMBER}"
     OUT="gs://${BUCKET}/results/${JOB_NAME}/${BUILD_NUMBER}"
-
-    # discover mapper / reducer
-    MAP="${MAP:-}"
-    RED="${RED:-}"
-    if [[ -z "$MAP" ]]; then
-      if [[ -f mapper.py ]]; then MAP=mapper.py; else MAP="$(git ls-files | grep -E '/?mapper\\.py$' | head -n1)"; fi
-    fi
-    if [[ -z "$RED" ]]; then
-      if [[ -f reducer.py ]]; then RED=reducer.py; else RED="$(git ls-files | grep -E '/?reducer\\.py$' | head -n1)"; fi
-    fi
-    [[ -n "$MAP" && -n "$RED" ]] || { echo "mapper.py/reducer.py not found"; exit 1; }
-
-    echo "Using mapper: $MAP"
-    echo "Using reducer: $RED"
-
-    MAP_GS="${INPUT_PREFIX}/${MAP}"
-    RED_GS="${INPUT_PREFIX}/${RED}"
-
     gsutil -m rm -r "${OUT}" >/dev/null 2>&1 || true

+    # Use files from flat data prefix only (avoid directories)
+    # Ship mapper/reducer via -files from code prefix
     gcloud dataproc jobs submit hadoop \
       --cluster="${CLUSTER_NAME}" \
       --region="${REGION}" \
       --jar="${HADOOP_STREAMING_RESOLVED_JAR}" \
       -- \
-      -D mapreduce.job.maps=4 \
-      -D mapreduce.job.reduces=2 \
-      -files "${MAP_GS},${RED_GS}" \
-      -mapper "python3 $(basename "${MAP}")" \
-      -reducer "python3 $(basename "${RED}")" \
-      -input "${INPUT_PREFIX}" \
+      -D mapreduce.job.maps=2 \
+      -D mapreduce.job.reduces=1 \
+      -files "${CODE_PREFIX}/${MAP_BASENAME},${CODE_PREFIX}/${RED_BASENAME}" \
+      -mapper "python3 ${MAP_BASENAME}" \
+      -reducer "python3 ${RED_BASENAME}" \
+      -input "${DATA_PREFIX}/*" \
       -output "${OUT}"

     gsutil cat "${OUT}"/part-* | tee line_counts.txt
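
Note: the mapper.py / reducer.py that this pipeline discovers and stages are not part of this commit. As a hypothetical sketch of a pair compatible with the streaming invocation above (lines on stdin, tab-separated key/value pairs on stdout), assuming a simple word count; the repo's actual scripts may differ:

#!/usr/bin/env python3
# mapper.py (hypothetical sketch; the repo's actual mapper may differ)
# Hadoop Streaming pipes each input line to stdin; emit "word<TAB>1" pairs.
import sys

for line in sys.stdin:
    for word in line.split():
        print(word + "\t1")

#!/usr/bin/env python3
# reducer.py (hypothetical sketch; the repo's actual reducer may differ)
# Streaming sorts mapper output by key, so equal keys arrive contiguously;
# sum the counts for each run of identical keys and emit one total per key.
import sys

current_key, count = None, 0
for line in sys.stdin:
    key, _, value = line.rstrip("\n").partition("\t")
    if key == current_key:
        count += int(value)
    else:
        if current_key is not None:
            print(current_key + "\t" + str(count))
        current_key, count = key, int(value)
if current_key is not None:
    print(current_key + "\t" + str(count))

With the generated sample.txt, such a pair would produce alpha 3, beta 2, gamma 1 in the part-* files that the final gsutil cat step concatenates into line_counts.txt.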

python/bad.py

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 def oops():
-    return 11/0
+    return 1/0
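
python/bad.py still raises ZeroDivisionError after this change; only the constant is simplified, presumably keeping the file as a deliberate finding for the pipeline's SonarQube scan. A quick hypothetical check (not part of the repo; assumes the python/ directory is on sys.path):

# check_bad.py (hypothetical; not in the repo)
from bad import oops

try:
    oops()
except ZeroDivisionError:
    print("oops() still raises ZeroDivisionError")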
