cs190-2020-winter: assignment6-solution.ipynb

File assignment6-solution.ipynb, 5.7 KB (added by alsaudia, 6 years ago)

HW6-solution

Line 1 (raw notebook JSON):
{"cells":[
{"cell_type":"markdown","source":["Assignment 6"],"metadata":{}},
{"cell_type":"markdown","source":["setup environment (must run first)"],"metadata":{}},
{"cell_type":"code","source":["%python\n# Load each Yelp table once, cache it for faster reuse, and register it as a\n# temp view so the %sql cells can query it. Every later cell (DataFrame and\n# SQL alike) assumes this cell has been run first.\nimport pyspark.sql.functions as func  # shared by the DataFrame solutions below\n\nbusiness_df = spark.read.option(\"multiLine\", True).json(\"/FileStore/tables/business.json\")\nbusiness_df.cache() # Cache data for faster reuse\nreview_df = spark.read.option(\"multiLine\", True).json(\"/FileStore/tables/review.json\")\nreview_df.cache() # Cache data for faster reuse\nuser_df = spark.read.option(\"multiLine\", True).json(\"/FileStore/tables/user.json\")\nuser_df.cache() # Cache data for faster reuse\ntip_df = spark.read.option(\"multiLine\", True).json(\"/FileStore/tables/tip.json\")\ntip_df.cache() # Cache data for faster reuse\nphoto_df = spark.read.option(\"multiLine\", True).json(\"/FileStore/tables/photo.json\")\nphoto_df.cache() # Cache data for faster reuse\ncheckin_df = spark.read.option(\"multiLine\", True).json(\"/FileStore/tables/checkin.json\")\ncheckin_df.cache() # Cache data for faster reuse\n\nbusiness_df.createOrReplaceTempView(\"business\")\nreview_df.createOrReplaceTempView(\"review\")\nuser_df.createOrReplaceTempView(\"user\")\ntip_df.createOrReplaceTempView(\"tip\")\nphoto_df.createOrReplaceTempView(\"photo\")\ncheckin_df.createOrReplaceTempView(\"checkin\")"],"metadata":{},"outputs":[],"execution_count":1},
{"cell_type":"markdown","source":["Solution"],"metadata":{}},
{"cell_type":"code","source":["#1.A  Print the inferred schema of the business table.\nbusiness_df.printSchema()\n#1.B hours data type is: struct (nullable)"],"metadata":{},"outputs":[],"execution_count":2},
{"cell_type":"code","source":["%sql\n--1.C  Describe the user table's columns and types.\nDESCRIBE user;\n--1.D elite data type is: array<string>"],"metadata":{},"outputs":[],"execution_count":3},
{"cell_type":"code","source":["#2.A DF -- all photo rows for one specific business\ndf = photo_df.filter(photo_df['business_id'] == '-AtzcXIwEP6yO7rM9CM9ww')\ndisplay(df)"],"metadata":{},"outputs":[],"execution_count":4},
{"cell_type":"code","source":["%sql\n--2.A SQL\nSELECT *\nFROM photo\nWHERE business_id = '-AtzcXIwEP6yO7rM9CM9ww';"],"metadata":{},"outputs":[],"execution_count":5},
{"cell_type":"code","source":["#2.B DF -- count of businesses that are currently open\ncnt = business_df.filter(business_df['is_open'] == 1).count()\nprint(cnt)"],"metadata":{},"outputs":[],"execution_count":6},
{"cell_type":"code","source":["%sql\n--2.B SQL\nSELECT COUNT(*)\nFROM business\nWHERE is_open = 1;"],"metadata":{},"outputs":[],"execution_count":7},
{"cell_type":"code","source":["#2.C DF -- top-5 users by number of tips that received at least one compliment\n# (func is imported in the setup cell)\ndf = tip_df.filter(tip_df['compliment_count'] > 0).groupBy(\"user_id\").agg(func.count(\"*\").alias(\"cnt\")).sort(func.desc(\"cnt\")).select(\"user_id\").limit(5)\ndisplay(df)"],"metadata":{},"outputs":[],"execution_count":8},
{"cell_type":"code","source":["%sql\n--2.C SQL\nSELECT user_id\nFROM tip \nWHERE compliment_count > 0\nGROUP BY user_id\nORDER BY COUNT(*) DESC\nLIMIT 5;"],"metadata":{},"outputs":[],"execution_count":9},
{"cell_type":"code","source":["#2.D DF -- most common business category (one row per category via explode)\ndf = business_df.select('business_id', func.explode('categories').alias('cat'))\ndf = df.groupBy('cat').agg(func.count('business_id').alias('cnt')).sort(func.desc('cnt')).limit(1).select('cat')\ndisplay(df)"],"metadata":{},"outputs":[],"execution_count":10},
{"cell_type":"code","source":["%sql\n--2.D SQL\nSELECT cat\nFROM business LATERAL VIEW explode(categories) AS cat\nGROUP BY cat\nORDER BY COUNT(*) DESC\nLIMIT 1;\n"],"metadata":{},"outputs":[],"execution_count":11},
{"cell_type":"code","source":["#2.E DF -- all reviews of businesses in postal code 15071\nbusi_df = business_df.filter(business_df['postal_code'] == 15071).select('business_id')\ndfr = review_df.alias('r')\ndf = dfr.join(busi_df, dfr.business_id == busi_df.business_id)\ndf = df.select('r.*')  # drop the duplicated join key from the business side\ndisplay(df)"],"metadata":{},"outputs":[],"execution_count":12},
{"cell_type":"code","source":["%sql\n--2.E SQL\nSELECT r.*\nFROM review AS r, business AS b\nWHERE r.business_id = b.business_id\nAND b.postal_code = 15071;"],"metadata":{},"outputs":[],"execution_count":13},
{"cell_type":"code","source":["#2.F DF -- names of businesses with >50 check-in dates and >50 photos\nchk_df = checkin_df.filter(func.size(checkin_df['date']) > 50).alias('c')\npbo_df = photo_df.alias('p')\nbus_df = business_df.alias('b')\ndf_bus_join_chk = bus_df.join(chk_df, bus_df.business_id == chk_df.business_id)\ndf = df_bus_join_chk.join(pbo_df, bus_df.business_id == pbo_df.business_id).groupby('b.business_id','b.name').agg(func.count('photo_id').alias('cnt'))\ndf = df.filter(df['cnt'] > 50).select('b.name')\ndisplay(df)"],"metadata":{},"outputs":[],"execution_count":14},
{"cell_type":"code","source":["%sql\n--2.F SQL\nSELECT b.name\nFROM business AS b, checkin AS c, photo AS p\nWHERE b.business_id = c.business_id\nAND b.business_id = p.business_id\nAND size(c.date) > 50\nGROUP BY b.business_id, b.name\nHAVING COUNT(p.photo_id) > 50;"],"metadata":{},"outputs":[],"execution_count":15},
{"cell_type":"code","source":["%sql\n--2.G SQL\n-- For each user: the month with the most reviews, plus that month's review\n-- count and average stars. ROW_NUMBER() picks the top month per user\n-- deterministically; the previous First()-over-an-ordered-CTE approach relied\n-- on the CTE's ORDER BY being preserved, which SQL does not guarantee.\nWITH monthly AS \n(SELECT user_id, MONTH(date) AS month, COUNT(review_id) AS count_reviews, AVG(stars) AS avg_stars\nFROM review\nGROUP BY user_id, MONTH(date)),\nranked AS\n(SELECT *, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY count_reviews DESC) AS rn\nFROM monthly)\n\nSELECT user_id, month, count_reviews, avg_stars\nFROM ranked\nWHERE rn = 1\nORDER BY user_id;"],"metadata":{},"outputs":[],"execution_count":16},
{"cell_type":"code","source":["%sql\n--2.H SQL\n-- Running sum of 'useful' votes per user over a trailing 7-day window.\nSELECT user_id, review_id, date, useful, SUM(useful) OVER (\n PARTITION BY user_id ORDER BY CAST(date AS timestamp)\n RANGE BETWEEN INTERVAL '7' DAYS PRECEDING AND CURRENT ROW\n ) AS running_useful\nFROM review\nORDER BY user_id;"],"metadata":{},"outputs":[],"execution_count":17}
],"metadata":{"name":"assignment6","notebookId":1223256428313787},"nbformat":4,"nbformat_minor":0}