{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Expanding the measurement of culture with a sample of two billion humans" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "## Replication Data and Code" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "This notebook provides all the steps to replicate the results of our paper [Expanding the measurement of culture with a sample of two billion humans](https://doi.org/10.1098/rsif.2022.0085) published in the *Journal of the Royal Society Interface 19:20220085* (2022)." ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "## Robustness Analysis\n", "## Robustness to Penetration above Median" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Setup and Pre-requisites" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "-" } }, "source": [ "Let's start by importing the required packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [ "#%pylab --no-import-all\n", "%matplotlib inline\n", "\n", "import sys, os, time\n", "import numpy as np\n", "import pandas as pd\n", "pd.set_option('display.width', 160)\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import matplotlib.patches as mpatches\n", "import statsmodels.api as sm\n", "import statsmodels.formula.api as smf\n", "from statsmodels.iolib.summary2 import summary_col\n", "from sklearn.metrics.pairwise import cosine_similarity, cosine_distances, manhattan_distances, pairwise_distances\n", "from scipy.stats import zscore\n", "from scipy.cluster.hierarchy import dendrogram, linkage\n", "from scipy import spatial, stats\n", "from scipy.stats import zscore\n", "import 
MantelTest.MantelTest as MantelTest\n", "import re\n", "import seaborn as sns" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "Let's set up our paths" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "mytype = 'penetration'\n", "cut = 'above'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Input folders\n", "pathfb = './data/'\n", "pathfbor = pathfb + 'OriginalData/'\n", "pathregs = pathfb + 'Regs/'\n", "pathsamples = pathfb + 'DemographicData/'\n", "\n", "# Output folder for this robustness run: Regs/Representativeness/<mytype>/<cut>/\n", "pathout = pathregs + 'Representativeness/' + mytype + '/' + cut + '/'\n", "pathshare = pathout\n", "\n", "# makedirs creates any missing parent folders and, with exist_ok=True,\n", "# does nothing when the folder is already there.\n", "os.makedirs(pathfbor, exist_ok=True)\n", "os.makedirs(pathout, exist_ok=True)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Import Data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/ozak/anaconda3/envs/GeoPython39env/lib/python3.9/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Unknown extension is not supported and will be removed\n", " warn(msg)\n" ] } ], "source": [ "# Type of distance measure\n", "m = 'Cos'\n", "\n", "# Import all data\n", "df = pd.read_stata(pathregs + 'AllDistsFull.dta')\n", "\n", "# Import all distances\n", "mypairs = pd.read_stata(pathregs + 'AllDists.dta')\n", "mypairs.drop([x for x in mypairs.columns if x.endswith('uk') or x.endswith('usa')], inplace=True, axis=1)\n", "mypairs.drop([x for x in mypairs.columns if x.find('cognate')!=-1], inplace=True, axis=1)\n", "\n", "# Import samples\n", "sample_all = 
pd.read_excel(pathsamples + 'SampleCountries.xlsx', sheet_name=1, keep_default_na=False, na_values=[''])\n", "sample_wvs = pd.read_excel(pathsamples + 'SampleCountries.xlsx', sheet_name=2, keep_default_na=False, na_values=[''])\n", "sample_repr = pd.read_stata(pathsamples + 'representative_all.dta')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | countrycode3 | \n", "pop_tot | \n", "shareWDI_age | \n", "shareWDI_gender | \n", "countryname | \n", "fbpenetration | \n", "WVS | \n", "shareFB_age | \n", "shareFB_gender | \n", "countrynameFB | \n", "penetrationWDI_age | \n", "penetration_tot | \n", "diffgender | \n", "diffage | \n", "pctage | \n", "pctgender | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "CHN | \n", "1.386395e+09 | \n", "0.288391 | \n", "0.514821 | \n", "China | \n", "0.00 | \n", "1.0 | \n", "0.657890 | \n", "0.615631 | \n", "CN | \n", "0.002946 | \n", "0.004544 | \n", "0.100810 | \n", "0.369499 | \n", "0.052015 | \n", "0.007033 | \n", "
1 | \n", "TCD | \n", "1.489999e+07 | \n", "0.553384 | \n", "0.499404 | \n", "Chad | \n", "0.02 | \n", "NaN | \n", "0.649067 | \n", "0.834220 | \n", "TD | \n", "0.015872 | \n", "0.009396 | \n", "0.334817 | \n", "0.095683 | \n", "0.079544 | \n", "0.016224 | \n", "
2 | \n", "TKM | \n", "5.758075e+06 | \n", "0.409793 | \n", "0.490754 | \n", "Turkmenistan | \n", "0.00 | \n", "NaN | \n", "0.327031 | \n", "0.472757 | \n", "TM | \n", "0.004564 | \n", "0.011462 | \n", "0.017997 | \n", "0.082761 | \n", "0.093721 | \n", "0.019765 | \n", "
3 | \n", "SSD | \n", "1.257571e+07 | \n", "0.515470 | \n", "0.499250 | \n", "South Sudan | \n", "0.01 | \n", "NaN | \n", "0.538432 | \n", "0.783775 | \n", "SS | \n", "0.031825 | \n", "0.017494 | \n", "0.284525 | \n", "0.022962 | \n", "0.108701 | \n", "0.027796 | \n", "
4 | \n", "CAF | \n", "4.659080e+06 | \n", "0.529828 | \n", "0.492986 | \n", "Central African Republic | \n", "0.02 | \n", "NaN | \n", "0.519295 | \n", "0.709647 | \n", "CF | \n", "0.043283 | \n", "0.023610 | \n", "0.216661 | \n", "0.010534 | \n", "0.122169 | \n", "0.041294 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
171 | \n", "BRN | \n", "4.286970e+05 | \n", "0.348671 | \n", "0.517849 | \n", "Brunei Darussalam | \n", "0.81 | \n", "NaN | \n", "0.434901 | \n", "0.563942 | \n", "BN | \n", "1.139706 | \n", "1.003039 | \n", "0.046093 | \n", "0.086230 | \n", "NaN | \n", "NaN | \n", "
172 | \n", "BHR | \n", "1.492584e+06 | \n", "0.339478 | \n", "0.660007 | \n", "Bahrain | \n", "0.70 | \n", "NaN | \n", "0.460658 | \n", "0.737863 | \n", "BH | \n", "0.920223 | \n", "1.004969 | \n", "0.077856 | \n", "0.121181 | \n", "NaN | \n", "NaN | \n", "
173 | \n", "KWT | \n", "4.136528e+06 | \n", "0.271252 | \n", "0.589644 | \n", "Kuwait | \n", "0.74 | \n", "NaN | \n", "0.433159 | \n", "0.708900 | \n", "KW | \n", "0.940266 | \n", "1.039519 | \n", "0.119256 | \n", "0.161907 | \n", "NaN | \n", "NaN | \n", "
174 | \n", "ARE | \n", "9.400145e+06 | \n", "0.286527 | \n", "0.759823 | \n", "United Arab Emirates | \n", "0.91 | \n", "NaN | \n", "0.475820 | \n", "0.742728 | \n", "AE | \n", "1.122771 | \n", "1.170195 | \n", "0.017095 | \n", "0.189293 | \n", "NaN | \n", "NaN | \n", "
175 | \n", "QAT | \n", "2.639211e+06 | \n", "0.369710 | \n", "0.791233 | \n", "Qatar | \n", "0.85 | \n", "NaN | \n", "0.571068 | \n", "0.762519 | \n", "QA | \n", "1.083616 | \n", "1.250374 | \n", "0.028714 | \n", "0.201358 | \n", "NaN | \n", "NaN | \n", "
176 rows × 16 columns
\n", "