1- // Databricks notebook source exported at Mon, 12 Dec 2016 15:11:23 UTC
1+ // Databricks notebook source exported at Mon, 12 Dec 2016 15:14:17 UTC
22// MAGIC %md
33// MAGIC # Tweet Anatomy & Transmission Tree
44// MAGIC
@@ -499,332 +499,3 @@ display(TTTDF.select($"CurrentTweetDate",$"CurrentTwID",$"CurrentTweet",$"TweetT
499499
500500// COMMAND ----------
501501
502-
503-
504- // COMMAND ----------
505-
506- // MAGIC %md
507- // MAGIC Count the number of tweets in the TTT Table
508-
509- // COMMAND ----------
510-
511- StreamedTweetsfClassDF .count() // action: returns the number of rows in StreamedTweetsfClassDF
512-
513- // COMMAND ----------
514-
515- // MAGIC %md
516- // MAGIC As shown in the schema and displayed table, the objects have been renamed. Details of the alias given to each selected object in the TTT function are described in Step 4 above.
517-
518- // COMMAND ----------
519-
520- StreamedTweetsfClassDF .printSchema() // prints the column names and types of the DataFrame
521-
522- // COMMAND ----------
523-
524- StreamedTweetsfClassDF .take(5 ) // fetches the first 5 rows for a quick look at the data
525-
526- // COMMAND ----------
527-
528- // MAGIC %md
529- // MAGIC #### Step 5: Exploratory Analysis
530- // MAGIC The tweets dataset for the TTT table can be explored using either SQL operators or standard DataFrame operators
531-
532- // COMMAND ----------
533-
534- // MAGIC %md
535- // MAGIC **Step 5.1: Tweets Exploration using DataFrame Operators**
536-
537- // COMMAND ----------
538-
539- // MAGIC %md
540- // MAGIC **Number of Tweets by Type**
541-
542- // COMMAND ----------
543-
544- display(StreamedTweetsfClassDF .groupBy($" tweetType" ).count().orderBy($" count" .desc)) // row count per tweetType, largest group first
545-
546- // COMMAND ----------
547-
548- display(StreamedTweetsfClassDF .groupBy($" tweetType" ).count().orderBy($" count" .desc)) // NOTE(review): identical to the previous cell; looks like an accidental duplicate, confirm before keeping
549-
550- // COMMAND ----------
551-
552- // MAGIC %md
553- // MAGIC **Number of Tweets by Verified Users**
554-
555- // COMMAND ----------
556-
557- display(StreamedTweetsfClassDF .filter($" IsVerified" === true ).groupBy($" tweetType" ).count().orderBy($" count" .desc)) // per-tweetType row count restricted to rows where IsVerified is true, largest first
558-
559- // COMMAND ----------
560-
561- // MAGIC %md
562- // MAGIC **Number of Tweets by Users who enabled their Geographical location**
563-
564- // COMMAND ----------
565-
566- display(StreamedTweetsfClassDF .filter($" IsGeoEnabled" === true ).groupBy($" tweetType" ).count().orderBy($" count" .desc)) // per-tweetType row count restricted to rows where IsGeoEnabled is true, largest first
567-
568- // COMMAND ----------
569-
570- // MAGIC %md
571- // MAGIC **Number of Retweets by Distinct Users for Each Tweet Created by Democratic and Republican Members**
572-
573- // COMMAND ----------
574-
575- display(
576- StreamedTweetsfClassDF .filter($" tweetType" === " ReTweet" ) // keep only rows whose tweetType is ReTweet
577- .groupBy($" OPostUserSNinRT" ,$" CPostUserSN" ) // one group per (OPostUserSNinRT, CPostUserSN) pair
578- .agg(max(" followersCount" ),max(" friendsCount" ),sum(" Weight" ).alias(" ReTweetCount" )) // per pair: max followersCount, max friendsCount, and summed Weight exposed as ReTweetCount
579- .orderBy($" ReTweetCount" .desc) // pairs with the largest ReTweetCount first
580- )
581-
582- // COMMAND ----------
583-
584- // MAGIC %md
585- // MAGIC **Trump's & Clinton's Statuses Retweeted by Users with More than 2 Followers and Friends**
586-
587- // COMMAND ----------
588-
589- val minimumAgeSinceAccountCreatedInDays = 100 // threshold (in days) compared against daysSinceUserCreated in the filter further down
590-
591- // COMMAND ----------
592-
593- val TrumpClintonRetweets = StreamedTweetsfClassDF // retweets of realDonaldTrump / HillaryClinton from accounts passing the age and follower/friend filters
594- .withColumn(" now" , lit(current_timestamp())) // temporary column holding the current timestamp; NOTE(review): lit(...) around a Column looks redundant, confirm
595- .withColumn(" daysSinceUserCreated" ,datediff($" now" ,$" userCreatedAtDate" )) // days elapsed between userCreatedAtDate and now
596- .drop($" now" ) // the helper column is no longer needed
597- .filter($" daysSinceUserCreated" > minimumAgeSinceAccountCreatedInDays) // strictly older than the threshold
598- .filter($" tweetType" === " ReTweet" ) // retweets only
599- .filter($" OPostUserSNinRT" === " realDonaldTrump" || $" OPostUserSNinRT" === " HillaryClinton" ) // original poster is one of the two accounts
600- .filter($" followersCount" > 2 && $" friendsCount" > 2 )// keeps accounts with more than 2 followers AND more than 2 friends (drops those with <3 of either)
601- .cache() // marks the result for caching so the display below and later uses can reuse it
602-
603- display(TrumpClintonRetweets )
604-
605- // COMMAND ----------
606-
607- // MAGIC %md
608- // MAGIC **Filtered Retweets for Trump and Clinton**
609-
610- // COMMAND ----------
611-
612- val TrumpClintonRetweetPairs = StreamedTweetsfClassDF // per-(retweeting user, original poster) retweet totals for Trump/Clinton retweets
613- .withColumn(" now" , lit(current_timestamp())) // temporary column holding the current timestamp
614- .withColumn(" daysSinceUserCreated" ,datediff($" now" ,$" userCreatedAtDate" )) // days elapsed between userCreatedAtDate and now
615- .drop($" now" ) // the helper column is no longer needed
616- .filter($" daysSinceUserCreated" > minimumAgeSinceAccountCreatedInDays) // account must be strictly older than the threshold
617- .filter($" tweetType" === " ReTweet" ) // retweets only
618- .filter($" OPostUserSNinRT" === " realDonaldTrump" || $" OPostUserSNinRT" === " HillaryClinton" ) // original poster is one of the two accounts
619- .groupBy($" userCreatedAtDate" ,$" daysSinceUserCreated" ,$" OPostUserSNinRT" ,$" CPostUserSN" ) // one group per retweeting account / original poster combination
620- .agg(max(" favouritesCount" ),max(" followersCount" ),max(" friendsCount" ),sum(" Weight" ).alias(" ReTweetCount" )) // un-aliased max columns get default names such as max(followersCount), used by the next filter
621- .filter($" max(followersCount)" > 2 && $" max(friendsCount)" > 2 )// keeps accounts with more than 2 followers AND more than 2 friends (drops those with <3 of either)
622- .orderBy($" ReTweetCount" .desc) // largest ReTweetCount first
623- .cache() // marks the result for caching; it is displayed and aggregated below
624-
625- display(TrumpClintonRetweetPairs )
626-
627- // COMMAND ----------
628-
629- TrumpClintonRetweetPairs .agg(sum($" RetweetCount" )).show() // total of ReTweetCount over the pairs that passed the account-age and follower/friend filters; NOTE(review): the column is aliased ReTweetCount above, so this RetweetCount spelling relies on case-insensitive column resolution, confirm
630-
631- // COMMAND ----------
632-
633- // MAGIC %md
634- // MAGIC **Filtered Retweets for Trump and Clinton by Geographical Enabled Users**
635-
636- // COMMAND ----------
637-
638- val TrumpClintonRetweetPairsGeoEnabled = StreamedTweetsfClassDF // same pipeline as TrumpClintonRetweetPairs, with an extra IsGeoEnabled filter
639- .withColumn(" now" , lit(current_timestamp())) // temporary column holding the current timestamp
640- .withColumn(" daysSinceUserCreated" ,datediff($" now" ,$" userCreatedAtDate" )) // days elapsed between userCreatedAtDate and now
641- .drop($" now" ) // the helper column is no longer needed
642- .filter($" daysSinceUserCreated" > minimumAgeSinceAccountCreatedInDays) // account must be strictly older than the threshold
643- .filter($" IsGeoEnabled" === true ) // only rows where IsGeoEnabled is true
644- .filter($" tweetType" === " ReTweet" ) // retweets only
645- .filter($" OPostUserSNinRT" === " realDonaldTrump" || $" OPostUserSNinRT" === " HillaryClinton" ) // original poster is one of the two accounts
646- .groupBy($" userCreatedAtDate" ,$" daysSinceUserCreated" ,$" OPostUserSNinRT" ,$" CPostUserSN" ) // one group per retweeting account / original poster combination
647- .agg(max(" favouritesCount" ),max(" followersCount" ),max(" friendsCount" ),sum(" Weight" ).alias(" ReTweetCount" )) // un-aliased max columns get default names such as max(followersCount), used by the next filter
648- .filter($" max(followersCount)" > 2 && $" max(friendsCount)" > 2 )// keeps accounts with more than 2 followers AND more than 2 friends (drops those with <3 of either)
649- .orderBy($" ReTweetCount" .desc) // largest ReTweetCount first
650- .cache() // marks the result for caching; it is displayed and aggregated below
651-
652- display(TrumpClintonRetweetPairsGeoEnabled )
653-
654- // COMMAND ----------
655-
656- TrumpClintonRetweetPairsGeoEnabled .agg(sum($" RetweetCount" )).show() // total of ReTweetCount over the geo-enabled pairs that passed the filters above; NOTE(review): alias is ReTweetCount, so this spelling relies on case-insensitive column resolution, confirm
657-
658- // COMMAND ----------
659-
660- // MAGIC %md
661- // MAGIC **Filtered Retweets for Trump and Clinton by Users with Verified Account**
662-
663- // COMMAND ----------
664-
665- val TrumpClintonRetweetPairsVerified = StreamedTweetsfClassDF // same pipeline as TrumpClintonRetweetPairs, with an extra IsVerified filter
666- .withColumn(" now" , lit(current_timestamp())) // temporary column holding the current timestamp
667- .withColumn(" daysSinceUserCreated" ,datediff($" now" ,$" userCreatedAtDate" )) // days elapsed between userCreatedAtDate and now
668- .drop($" now" ) // the helper column is no longer needed
669- .filter($" daysSinceUserCreated" > minimumAgeSinceAccountCreatedInDays) // account must be strictly older than the threshold
670- .filter($" IsVerified" === true ) // only rows where IsVerified is true
671- .filter($" tweetType" === " ReTweet" ) // retweets only
672- .filter($" OPostUserSNinRT" === " realDonaldTrump" || $" OPostUserSNinRT" === " HillaryClinton" ) // original poster is one of the two accounts
673- .groupBy($" userCreatedAtDate" ,$" daysSinceUserCreated" ,$" OPostUserSNinRT" ,$" CPostUserSN" ,$" CPostUserID" ) // NOTE(review): unlike the sibling pipelines this one also groups by CPostUserID, confirm that is intended
674- .agg(max(" favouritesCount" ),max(" followersCount" ),max(" friendsCount" ),sum(" Weight" ).alias(" ReTweetCount" )) // un-aliased max columns get default names such as max(followersCount), used by the next filter
675- .filter($" max(followersCount)" > 2 && $" max(friendsCount)" > 2 )// keeps accounts with more than 2 followers AND more than 2 friends (drops those with <3 of either)
676- .orderBy($" ReTweetCount" .desc) // largest ReTweetCount first
677- .cache() // marks the result for caching; it is displayed and aggregated below
678-
679- display(TrumpClintonRetweetPairsVerified )
680-
681- // COMMAND ----------
682-
683- TrumpClintonRetweetPairsVerified .agg(sum($" RetweetCount" )).show() // total of ReTweetCount over the verified-account pairs that passed the filters above; NOTE(review): alias is ReTweetCount, so this spelling relies on case-insensitive column resolution, confirm
684-
685- // COMMAND ----------
686-
687- // MAGIC %md
688- // MAGIC **Step 5.2: Register the Data Frame as a Table and perform SQL operations**
689-
690- // COMMAND ----------
691-
692- StreamedTweetsfClassDF .registerTempTable(" TTT" ) // makes the DataFrame queryable from SQL under the name TTT; NOTE(review): registerTempTable is deprecated from Spark 2.0 in favour of createOrReplaceTempView, confirm the Spark version in use
693-
694- // COMMAND ----------
695-
696- // MAGIC %md
697- // MAGIC **Description of the SQL Table**
698-
699- // COMMAND ----------
700-
701- display(sql("""
702- DESCRIBE TTT
703- """ )) // shows the column listing of the TTT table registered above
704-
705- // COMMAND ----------
706-
707- // MAGIC %md
708- // MAGIC **Number of Tweets by Type**
709-
710- // COMMAND ----------
711-
712- display(sql("""
713- SELECT count(*) as count, TweetType
714- FROM TTT
715- GROUP BY TweetType
716- ORDER BY count DESC
717- """ )) // SQL version of the per-TweetType row count, largest group first
718-
719- // COMMAND ----------
720-
721- // MAGIC %md
722- // MAGIC **Number of User Mentions by Type**
723-
724- // COMMAND ----------
725-
726- display(sql("""
727- SELECT count(*) as count, MentionType
728- FROM TTT
729- GROUP BY MentionType
730- ORDER BY count DESC
731- """ )) // row count per MentionType, largest group first
732-
733- // COMMAND ----------
734-
735- // MAGIC %md
736- // MAGIC **To validate the TTT function, the following tweet-type and mention-type counts must be equal:**
737- // MAGIC
738- // MAGIC * Retweet === Retweet Mention
739- // MAGIC * Original + Reply Tweet === Authored Mention
740- // MAGIC * Quoted Tweet + Reply of Quoted Tweet === Quoted Mention
741- // MAGIC * Retweet of Quoted Tweet === Retweet_and_Quoted Mention
742-
743- // COMMAND ----------
744-
745- // MAGIC %md
746- // MAGIC **Sources of Retweet and Retweet of Quoted Tweet by Original Authors**
747-
748- // COMMAND ----------
749-
750- display(sql("""
751- SELECT COUNT(OPostUserNameinRT) as Count, OPostUserNameinRT, tweettype
752- FROM TTT
753- GROUP BY OPostUserNameinRT, tweettype
754- ORDER BY Count DESC, OPostUserNameinRT DESC
755- LIMIT 10
756- """ )) // top 10 (OPostUserNameinRT, tweettype) groups by count; COUNT(col) counts only non-null OPostUserNameinRT values
757-
758- // COMMAND ----------
759-
760- display(sql("""
761- SELECT COUNT(OPostUserNameinRT) as Count, OPostUserNameinRT, tweettype
762- FROM TTT
763- GROUP BY OPostUserNameinRT, tweettype
764- ORDER BY Count DESC, OPostUserNameinRT DESC
765- LIMIT 5
766- """ )) // same query as the previous cell, limited to the top 5 rows
767-
768- // COMMAND ----------
769-
770- // MAGIC %md
771- // MAGIC **Sources of Retweet Mentions by Users of Original Post**
772-
773- // COMMAND ----------
774-
775- display(sql("""
776- SELECT COUNT(OPostUserNameinRT) as Count, OPostUserNameinRT, MentionType
777- FROM TTT
778- GROUP BY OPostUserNameinRT, MentionType
779- ORDER BY Count DESC, OPostUserNameinRT DESC
780- LIMIT 10
781- """ )) // top 10 (OPostUserNameinRT, MentionType) groups by count of non-null OPostUserNameinRT
782-
783- // COMMAND ----------
784-
785- // MAGIC %md
786- // MAGIC **Frequency of each distinct User Tweets being Retweeted**
787-
788- // COMMAND ----------
789-
790- display(sql("""
791- SELECT COUNT(OPostUserNameinRT) as Count, OPostUserNameinRT, tweettype
792- FROM TTT
793- WHERE tweettype == 'ReTweet'
794- GROUP BY OPostUserNameinRT, tweettype
795- ORDER BY Count DESC, OPostUserNameinRT DESC
796- LIMIT 10
797- """ )) // top 10 original authors by number of ReTweet rows
798-
799- // COMMAND ----------
800-
801- // MAGIC %md
802- // MAGIC **Number of Times a User (CPostUserSN) Retweeted Another User's (OPostUserNameinRT) Previous Status**
803-
804- // COMMAND ----------
805-
806- display(sql("""
807- SELECT COUNT(CPostUserSN) as Count, CPostUserSN, OPostUserNameinRT, TweetType
808- FROM TTT
809- WHERE TweetType == 'ReTweet'
810- GROUP BY CPostUserSN, OPostUserNameinRT, TweetType
811- ORDER BY CPostUserSN ASC, Count DESC
812- LIMIT 10
813- """ )) // ReTweet rows per (CPostUserSN, OPostUserNameinRT) pair; NOTE(review): sorted by CPostUserSN first, so LIMIT 10 returns the alphabetically first users, not the highest counts; confirm intent
814-
815- // COMMAND ----------
816-
817- // MAGIC %md
818-
819- // COMMAND ----------
820-
821- // MAGIC %md
822-
823- // COMMAND ----------
824-
825- // Unpersist the cached DataFrames. NOTE(review): the TrumpClinton* DataFrames that call .cache() in this section are not unpersisted here; confirm whether they should be
826- StreamedTweetsPQ .unpersist()
827-
828- StreamedTweetsfDF .unpersist()
829-
830- StreamedTweetsfClassDF .unpersist()
0 commit comments