
Commit d7768ba

Merge pull request #1 from Fabryprog/spark-3.4.0

Spark 3.4.0

2 parents: f2f8b17 + 199b1d0

File tree: 6 files changed (+17 −19 lines)


.gitignore

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,2 +1,2 @@
 # Ignore data files
-*.csv
+#*.csv
```

Dockerfile

Lines changed: 2 additions & 2 deletions
```diff
@@ -7,8 +7,8 @@ RUN update-alternatives --install "/usr/bin/python" "python" "$(which python3)"
 
 # Fix the value of PYTHONHASHSEED
 # Note: this is needed when you use Python 3.3 or greater
-ENV SPARK_VERSION=3.0.2 \
-HADOOP_VERSION=3.2 \
+ENV SPARK_VERSION=3.4.0 \
+HADOOP_VERSION=3 \
 SPARK_HOME=/opt/spark \
 PYTHONHASHSEED=1
 
```
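Since Spark 3.3, the prebuilt binaries are published with a plain `hadoop3` suffix rather than `hadoop3.2`, which is why `HADOOP_VERSION` drops from `3.2` to `3` here. A minimal shell sketch of how these two variables typically combine into the downloaded artifact name (the exact download step inside this Dockerfile is not shown in the diff, so this naming is an assumption based on the standard Apache release layout):

```shell
#!/bin/sh
# Compose the Spark tarball name from the version variables set in the Dockerfile.
SPARK_VERSION=3.4.0
HADOOP_VERSION=3

TARBALL="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"
echo "${TARBALL}"   # → spark-3.4.0-bin-hadoop3.tgz
```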

README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -27,7 +27,7 @@ The following steps will make you run your spark cluster's containers.
 
 
 ```sh
-docker build -t cluster-apache-spark:3.0.2 .
+docker build -t cluster-apache-spark:3.4.0 .
 ```
 
 ## Run the docker-compose
````

apps/main.py

Lines changed: 4 additions & 9 deletions
```diff
@@ -10,7 +10,7 @@ def init_spark():
     return sql,sc
 
 def main():
-    url = "jdbc:postgresql://demo-database:5432/mta_data"
+    url = "jdbc:postgresql://demo-database:5432/postgres"
     properties = {
         "user": "postgres",
         "password": "casa1234",
@@ -19,17 +19,12 @@ def main():
     file = "/opt/spark-data/MTA_2014_08_01.csv"
     sql,sc = init_spark()
 
-    df = sql.read.load(file,format = "csv", inferSchema="true", sep="\t", header="true"
-    ) \
-        .withColumn("report_hour",date_format(col("time_received"),"yyyy-MM-dd HH:00:00")) \
-        .withColumn("report_date",date_format(col("time_received"),"yyyy-MM-dd"))
+    df = sql.read.load(file,format = "csv", inferSchema="true", sep="\t", header="true")
 
     # Filter invalid coordinates
-    df.where("latitude <= 90 AND latitude >= -90 AND longitude <= 180 AND longitude >= -180") \
-        .where("latitude != 0.000000 OR longitude != 0.000000 ") \
-        .write \
+    df.write \
         .jdbc(url=url, table="mta_reports", mode='append', properties=properties) \
         .save()
 
 if __name__ == '__main__':
-    main()
+    main()
```

data/MTA_2014_08_01.csv

Lines changed: 3 additions & 0 deletions
```diff
@@ -0,0 +1,3 @@
+Nome;Cognome
+Mario;Rossi
+Giacomo;Bianchi
```
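Note that this committed sample file is semicolon-delimited, while `apps/main.py` still reads it with `sep="\t"`, so Spark would load each row as a single column. A minimal stdlib sketch (pure Python, independent of Spark) showing how the sample parses once the correct delimiter is used:

```python
import csv
import io

# Same contents as the data/MTA_2014_08_01.csv added in this commit.
sample = "Nome;Cognome\nMario;Rossi\nGiacomo;Bianchi\n"

# Parse with ';' as the field separator, matching the file's actual format.
rows = list(csv.DictReader(io.StringIO(sample), delimiter=";"))
print(rows[0])  # {'Nome': 'Mario', 'Cognome': 'Rossi'}
```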

docker-compose.yml

Lines changed: 6 additions & 6 deletions
```diff
@@ -1,7 +1,7 @@
 version: "3.3"
 services:
   spark-master:
-    image: cluster-apache-spark:3.0.2
+    image: cluster-apache-spark:3.4.0
     ports:
       - "9090:8080"
       - "7077:7077"
@@ -12,10 +12,10 @@ services:
       - SPARK_LOCAL_IP=spark-master
       - SPARK_WORKLOAD=master
   spark-worker-a:
-    image: cluster-apache-spark:3.0.2
+    image: cluster-apache-spark:3.4.0
     ports:
       - "9091:8080"
-      - "7000:7000"
+      - "7001:7000"
     depends_on:
       - spark-master
     environment:
@@ -30,10 +30,10 @@ services:
       - ./apps:/opt/spark-apps
       - ./data:/opt/spark-data
   spark-worker-b:
-    image: cluster-apache-spark:3.0.2
+    image: cluster-apache-spark:3.4.0
     ports:
       - "9092:8080"
-      - "7001:7000"
+      - "7002:7000"
     depends_on:
       - spark-master
     environment:
@@ -48,7 +48,7 @@ services:
      - ./apps:/opt/spark-apps
      - ./data:/opt/spark-data
   demo-database:
-    image: postgres:11.7-alpine
+    image: postgres:15
     ports:
       - "5432:5432"
     environment:
```
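The worker port remappings (host 7000→7001 for `spark-worker-a`, 7001→7002 for `spark-worker-b`) give each container's internal port 7000 a distinct host port, avoiding bind conflicts when both workers run on one machine. A small stdlib sketch (a hypothetical helper, not part of this repo) for checking whether a published host port is already taken before running `docker-compose up`:

```python
import socket

def port_in_use(port: int, host: str = "127.0.0.1") -> bool:
    """Return True if something is already listening on host:port."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex((host, port)) == 0

# Check the host ports this compose file publishes.
for p in (9090, 9091, 9092, 7001, 7002, 7077, 5432):
    print(p, "in use" if port_in_use(p) else "free")
```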

0 commit comments
