# Create a SparkSession (the entry point for building DataFrames)
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CreateDataFrameFromDict").getOrCreate()

# A list of (name, properties) tuples, where properties is a Python dict
dataDictionary = [
('James',{'hair':'black','eye':'brown'}),
('Michael',{'hair':'brown','eye':None}),
('Robert',{'hair':'red','eye':'black'}),
('Washington',{'hair':'red','eye':'grey'}),
('Jefferson',{'hair':'red','eye':''})
]
# The dict values are inferred as a map<string,string> column
df = spark.createDataFrame(data=dataDictionary, schema=["name", "properties"])
df.printSchema()

# Basic inspection: column names, row count, and a single-column projection
print(df.columns)
print(df.count())
df.select('name').show()
df.show(truncate=False)
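
# The dict values above become a MapType column; a minimal sketch of pulling
# individual keys out of it (assumes the df created above):
from pyspark.sql.functions import col
df.select(
    col("name"),
    col("properties").getItem("hair").alias("hair"),
    col("properties").getItem("eye").alias("eye")
).show(truncate=False)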

# A second DataFrame with last names, used for the SQL joins below
dict2 = [
('James','James_last'),
('Michael','Michael_last'),
('Wendy', 'Wendy_last')
]
df2 = spark.createDataFrame(data=dict2, schema=['name','name_last'])
df2.show()
df.createOrReplaceTempView("PER")
spark.sql('select name from per where name like "J%"').show()
df2.createOrReplaceTempView("PER_LAST")
spark.sql('select * from per p1 full outer join per_last p2 on p1.name=p2.name').show()
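
# A sketch of the same full outer join using the DataFrame API instead of SQL
# (assumes the df and df2 defined above; 'full_outer' is one of Spark's join types):
df.join(df2, on='name', how='full_outer').show()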
spark.sql('select p1.name from per p1 full outer join per_last p2 on p1.name=p2.name').rdd.map(lambda x:x['name']).collect()
# ['James', 'Washington', 'Michael', 'Robert', 'Jefferson', None]
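
# An alternative sketch: collect the same names without dropping to the RDD API,
# using a list comprehension over the Row objects returned by collect():
rows = spark.sql('select p1.name from per p1 full outer join per_last p2 on p1.name=p2.name').collect()
print([row['name'] for row in rows])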

# Create a PySpark DataFrame from a pandas DataFrame (pandasDF is an example built from dict2 above)
import pandas as pd
pandasDF = pd.DataFrame(dict2, columns=['name', 'name_last'])
sparkDF = spark.createDataFrame(pandasDF)