# Jul 15, 2024
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, DateType
# Obtain the Spark session for this job (an existing session is reused if
# one is already active), registered under the app name 'IPLDataAnalysis'.
spark = (
    SparkSession.builder
    .appName('IPLDataAnalysis')
    .getOrCreate()
)
# Explicit schema for the ball-by-ball CSV. Each entry pairs a column name
# with its Spark type; every field is declared nullable (the third
# StructField argument is True).
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('match_id', IntegerType()),
        ('over_id', IntegerType()),
        ('ball_id', IntegerType()),
        ('innings_number', IntegerType()),
        ('team_batting', StringType()),
        ('team_bowling', StringType()),
        #... additional fields
    )
])
# Load the CSV using the explicit schema above rather than schema inference;
# the 'header' option tells the reader the first line holds column names.
ball_by_ball_df = (
    spark.read
    .format('csv')
    .schema(schema)
    .option('header', 'true')
    .load('s3a://bucket/path/to/csv')
)

# Expose the DataFrame to Spark SQL under the view name 'ball_by_ball',
# replacing any temp view already registered with that name.
ball_by_ball_df.createOrReplaceTempView('ball_by_ball')