Published on 01 May 2021 in Data Science

How to use MongoDB with Python

This short tutorial shows how to query MongoDB from Python using the PyMongo driver.

from pymongo import MongoClient
import pprint
import pandas as pd
# Open a client on the local MongoDB server, using the 'primaryPreferred'
# read preference.
client = MongoClient(
    'mongodb://localhost:27017/',
    readPreference='primaryPreferred',
)
# Select the 'Connect_to_Spark_db' database and its 'mentions_events' collection.
db = client['Connect_to_Spark_db']
mentions_events = db.mentions_events
# Fetch a single document and pretty-print it.
first_document = mentions_events.find_one()
pprint.pprint(first_document)

Test request: number of distinct actors

# Count the distinct Actor1Name values among the first 5,000,000 documents:
# group the documents by actor, then count the resulting groups.
agg = db.mentions_events.aggregate([
    {"$limit": 5000000},
    {
        "$group": {
            "_id": {"actor": "$Actor1Name"},
            "Count": {"$sum": 1},
        }
    },
    # "$count" replaces the groups with a single document {"myCount": <n>}.
    {"$count": "myCount"},
])
# Consume the cursor so the count is actually displayed.
pprint.pprint(list(agg))

Request 1

# Request 1: over the first 10,000 documents, sum NumMentions for every
# combination of yearmention, monthmention, daymonthevent,
# ActionGeo_CountryCode and MentionDocTranslationInfo.
req1_pipeline = [
    {"$limit": 10000},
    {
        "$group": {
            "_id": {
                "annee": "$yearmention",
                "mois": "$monthmention",
                "jour": "$daymonthevent",
                "pays": "$ActionGeo_CountryCode",
                "langue": "$MentionDocTranslationInfo",
            },
            "Nb_Articles": {"$sum": "$NumMentions"},
        }
    },
]
agg = db.mentions_events.aggregate(req1_pipeline)
# Materialise the cursor and pretty-print every grouped document.
req1_results = list(agg)
pprint.pprint(req1_results)

Request 2

# Request 2: events in which either actor is named "TOURIST".
# A PyMongo pipeline is built from plain Python dicts, so every operator and
# field name must be a quoted string; the mongo-shell form `{ $match: ... }`
# is not valid Python.
# Grouping on "_id" alone (no accumulator) yields one document per distinct
# combination of the listed fields.
agg = db.mentions_events.aggregate([
    {
        "$match": {
            "$or": [{"Actor1Name": "TOURIST"}, {"Actor2Name": "TOURIST"}]
        }
    },
    {
        "$group": {
            "_id": {
                "GLOBALEVENTID": "$GLOBALEVENTID",
                "Actor1Code": "$Actor1Code",
                "Actor1Name": "$Actor1Name",
                "Actor1CountryCode": "$Actor1CountryCode",
                "Actor2Code": "$Actor2Code",
                "Actor2Name": "$Actor2Name",
                "Actor2CountryCode": "$Actor2CountryCode",
                "EventCode": "$EventCode",
                "QuadClass": "$QuadClass",
                "SQLDATE": "$SQLDATE",
                "SOURCEURL": "$SOURCEURL",
            }
        }
    },
])

Request 3

# NOTE(review): this assignment had no right-hand side (a syntax error); it is
# kept as a comment until the intended value is known.
# df_correspondances = ...

# Request 3: over the first 1,000,000 documents, count the documents whose
# MentionDocTone is negative / positive for every combination of
# monthmention, ActionGeo_CountryCode, MentionDocTranslationInfo and
# Actor1Name.
agg = db.mentions_events.aggregate([
    {
        "$project": {
            "monthmention": 1,
            "ActionGeo_CountryCode": 1,
            "MentionDocTranslationInfo": 1,
            "Actor1Name": 1,
            # The leading "$" makes "$MentionDocTone" a field reference.
            # Without it the string literal itself is compared with 0, so the
            # flags no longer depend on the document's tone.
            "lessThan0": {
                "$cond": [{"$lt": ["$MentionDocTone", 0]}, 1, 0]
            },
            "moreThan0": {
                "$cond": [{"$gt": ["$MentionDocTone", 0]}, 1, 0]
            },
        }
    },
    {"$limit": 1000000},
    {
        "$group": {
            "_id": {
                "mois": "$monthmention",
                "pays": "$ActionGeo_CountryCode",
                "langue": "$MentionDocTranslationInfo",
                "actor": "$Actor1Name",
            },
            "Nb_Articles_Negatifs": {"$sum": "$lessThan0"},
            "Nb_Articles_Positifs": {"$sum": "$moreThan0"},
        }
    },
])

# Materialise the cursor exactly once: it can only be iterated a single time,
# so a second list(agg) would come back empty.
req3 = list(agg)
pprint.pprint(req3)
# One row per grouped document; the "_id" column holds the grouping key dict.
df_3 = pd.DataFrame(req3)
{'_id': {'mois': 1, 'pays': 'GM', 'langue': 'ara', 'actor': 'IRANIAN'},
 'Nb_Articles_Negatifs': 0,
 'Nb_Articles_Positifs': 3}

Request 4


# Request 4: over the first 10,000 documents, average GoldsteinScale per
# (GLOBALEVENTID, Actor1Name) pair, then sum those averages per Actor1Name
# and sort the actors by ascending total.
avg_per_event_and_actor = {
    "$group": {
        "_id": {
            "GLOBALEVENTID": "$GLOBALEVENTID",
            "Actor1Name": "$Actor1Name",
        },
        "avgGoldstein": {"$avg": "$GoldsteinScale"},
    }
}
total_per_actor = {
    "$group": {
        "_id": {"Actor1Name": "$_id.Actor1Name"},
        "totalGoldstein": {"$sum": "$avgGoldstein"},
    }
}
req4_pipeline = [
    {"$limit": 10000},
    avg_per_event_and_actor,
    total_per_actor,
    {"$sort": {"totalGoldstein": 1}},
]
agg = db.mentions_events.aggregate(req4_pipeline)

# Materialise the cursor and pretty-print the sorted totals.
req4_results = list(agg)
pprint.pprint(req4_results)