Tuesday, March 30, 2021

PySpark Demo: creating DataFrames (including map-typed columns) and querying them with Spark SQL


# Build a DataFrame whose second column is a map of string properties.
people = [
    ('James', {'hair': 'black', 'eye': 'brown'}),
    ('Michael', {'hair': 'brown', 'eye': None}),
    ('Robert', {'hair': 'red', 'eye': 'black'}),
    ('Washington', {'hair': 'red', 'eye': 'grey'}),
    ('Jefferson', {'hair': 'red', 'eye': ''}),
]

df = spark.createDataFrame(data=people, schema=["name", "properties"])

# Inspect the result: schema, column names, row count, a projection, and the rows.
# NOTE(review): the bare `df.columns` / `df.count()` / `df.select(...)` lines only
# display output in a REPL/notebook; in a plain script their results are discarded.
df.printSchema()
df.columns
df.count()
df.select('name')
df.show(truncate=False)

# A second, smaller DataFrame keyed by the same 'name' column; it is joined
# against `df` later via the PER_LAST temp view.
name_pairs = [
    ('James', 'James_last'),
    ('Michael', 'Michael_last'),
    ('Wendy', 'Wendy_last'),
]
df2 = spark.createDataFrame(data=name_pairs, schema=['name', 'name_last'])
df2.show()

# Register both DataFrames as temp views so they can be queried with SQL.
df.createOrReplaceTempView("PER")
spark.sql('select name from per where name like "J%"').show()

df2.createOrReplaceTempView("PER_LAST")

# Full outer join keeps unmatched rows from both sides (nulls fill the gaps).
join_query = 'select * from per p1 full outer join per_last p2 on p1.name=p2.name'
spark.sql(join_query).show()

# Pull just the joined name column back to the driver as a plain Python list.
names_query = 'select p1.name from per p1 full outer join per_last p2 on p1.name=p2.name'
spark.sql(names_query).rdd.map(lambda row: row['name']).collect()
# ['James', 'Washington', 'Michael', 'Robert', 'Jefferson', None]

# Create a PySpark DataFrame from a Pandas DataFrame.
# FIX: the original referenced `pandasDF`, a name never defined anywhere in this
# file (NameError at runtime). Build a small example pandas DataFrame first so
# the snippet is self-contained.
import pandas as pd

pandasDF = pd.DataFrame({'name': ['James', 'Michael', 'Robert'],
                         'age': [30, 40, 50]})
sparkDF = spark.createDataFrame(pandasDF)

No comments: