# Jul 4, 2024 — PySpark practice script (run inside the `my_env` environment).
# Setup: `pip install pyspark`, then run `import pyspark`
# Verify the install with `import pyspark` in Python, then build a session.
from pyspark.sql import SparkSession

# One SparkSession per application; getOrCreate() reuses an existing one.
spark = SparkSession.builder.appName('practice').getOrCreate()

# Load the sample CSV. header=True uses the first row as column names;
# inferSchema=True makes numeric columns numeric instead of strings.
df = spark.read.csv('test1.csv', header=True, inferSchema=True)
df.show()
df.printSchema()
# Peek at a couple of columns.
df.select('name', 'experience').show()

# Add a derived column, show it, then drop it again.
df = df.withColumn('experience_plus_two', df.experience + 2)
df.show()
df = df.drop('experience_plus_two')
df.show()

# Rename a column in place.
df = df.withColumnRenamed('name', 'new_name')
df.show()
# Rows with salary below 20,000.
df.filter(df['salary'] < 20000).show()

# Drop any row containing a null.
df.na.drop().show()

# BUG FIX: na.fill() only applies a value to columns whose type matches it.
# 'age' and 'experience' are numeric (inferSchema; the Imputer below also
# requires numeric inputs), so filling them with the string 'missing_value'
# was a silent no-op. Use a numeric placeholder instead.
df.na.fill(0, subset=['age', 'experience']).show()
from pyspark.ml.feature import Imputer

# Replace missing age/experience values with the column mean,
# writing the results to new *_imputed columns.
mean_imputer = Imputer(
    strategy='mean',
    inputCols=['age', 'experience'],
    outputCols=['age_imputed', 'experience_imputed'],
)
df = mean_imputer.fit(df).transform(df)
df.show()
# Per-department salary totals and averages. The dict form of agg()
# yields the same 'sum(salary)' / 'avg(salary)' result columns as
# .sum('salary') / .mean('salary').
df.groupBy('department').agg({'salary': 'sum'}).show()
df.groupBy('department').agg({'salary': 'avg'}).show()
from pyspark.ml.feature import VectorAssembler

# BUG FIX: VectorAssembler defaults to handleInvalid='error' and raises on
# null values. The raw 'age'/'experience' columns still contain nulls — that
# is why they were imputed above — so assemble the imputed columns instead.
assembler = VectorAssembler(
    inputCols=['age_imputed', 'experience_imputed'],
    outputCol='features',
)
output = assembler.transform(df)
output.select('features', 'salary').show()
# BUG FIX: the original split `df`, which has no 'features' column, so
# lr.fit(train_data) would fail with "Column features does not exist".
# Split the assembled data instead; a fixed seed makes the split reproducible.
final_data = output.select('features', 'salary')
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

from pyspark.ml.regression import LinearRegression

# Fit a linear model predicting salary from the assembled feature vector,
# then score the held-out split.
lr = LinearRegression(featuresCol='features', labelCol='salary')
lr_model = lr.fit(train_data)
predictions = lr_model.transform(test_data)
predictions.show()