Skip to content

Commit

Permalink
fpu14
Browse files Browse the repository at this point in the history
  • Loading branch information
Brem090 committed Jan 31, 2025
1 parent c38aabf commit 90575a5
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 6 deletions.
Binary file modified encrypted_file/eugene_etl_pipeline.tar_encrypted.tar.gz
Binary file not shown.
8 changes: 6 additions & 2 deletions eugene/bronze_to_silver.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import StringType
import re, os, logging

Expand Down Expand Up @@ -36,7 +36,11 @@ def process_table(spark, table_name):

# Remove duplicate rows
df = df.dropDuplicates()


# Replace missing values in the "medal" column with "No Medal" when the table
# has such a column.
# `na` is an accessor on DataFrame, not on Column, so `col("medal").na.fill(...)`
# cannot work; the fill is done at the DataFrame level and limited to "medal"
# via `subset`.
# NOTE(review): a string fill value only applies to string-typed columns —
# confirm "medal" is read as a string.
if "medal" in df.columns:
    df = df.fillna("No Medal", subset=["medal"])

# Log a preview of the cleaned data; show(20) prints up to 20 rows to stdout
# rather than going through the logger.
logger.info(f"Data preview after cleaning for {table_name}:")
df.show(20)
Expand Down
8 changes: 4 additions & 4 deletions eugene/silver_to_gold.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@
# Load the two silver-layer tables from Parquet.
# NOTE(review): `schema=` is passed to parquet() as a keyword option; it looks
# like DataFrameReader.parquet() does not apply a schema given this way —
# confirm, and consider spark.read.schema(bio_schema).parquet(...) if the
# schema must be enforced.
bio_df = spark.read.parquet("silver/athlete_bio", schema=bio_schema)
results_df = spark.read.parquet("silver/athlete_event_results")

# Filter medals: keep only rows whose medal is Gold, Silver or Bronze
valid_medals = ["Gold", "Silver", "Bronze"]
results_selected = results_df.select("athlete_id", "sport", "medal").filter(col("medal").isin(valid_medals))
# Select the needed columns from athlete_event_results and replace missing
# values in the "medal" column with "No Medal".
# `na` is an accessor on DataFrame, not on Column, so `col("medal").na.fill(...)`
# cannot work; fillna() is applied to the selected DataFrame and limited to
# "medal" via `subset`.
results_selected = results_df.select("athlete_id", "sport", "medal") \
    .fillna("No Medal", subset=["medal"])

# Select columns from athlete_bio
bio_selected = bio_df.select("athlete_id", "sex", "country_noc", "weight", "height")
Expand Down Expand Up @@ -63,4 +63,4 @@
# Repartition by sport and medal, then write the result to gold/avg_stats as
# Parquet, overwriting any existing output at that path.
filtered_stats.repartition("sport", "medal").write.parquet("gold/avg_stats", mode="overwrite")

# Stop the Spark session
spark.stop()
spark.stop()

0 comments on commit 90575a5

Please sign in to comment.