datasets

Published by onesixx

pydataset

from pydataset import data

# Calling data() with no argument lists every dataset pydataset ships.
data()

# Fetch a single dataset as a pandas DataFrame and peek at it.
cars_df = data('mtcars')
print(cars_df.head())

vega_datasets

# vega_datasets exposes a callable `data` loader object; import that object,
# not the package itself. The original `import vega_datasets as data` made
# `data.cars()` raise AttributeError and `data('iris')` raise TypeError,
# because the package module is neither callable nor has a `cars` attribute.
from vega_datasets import data

data.cars()    # attribute access works for identifier-friendly names
data('iris')   # call form works for any dataset name
data('7zip')   # call form is required for names that are not valid identifiers


['7zip',
 'airports',
 'annual-precip',
 'anscombe',
 'barley',
 'birdstrikes',
 'budget',
 'budgets',
 'burtin',
 'cars',
 'climate',
 'co2-concentration',
 'countries',
 'crimea',
 'disasters',
 'driving',
 'earthquakes',
 'ffox',
 'flare',
 'flare-dependencies',
 'flights-10k',
 'flights-200k',
 'flights-20k',
 'flights-2k',
 'flights-3m',
 'flights-5k',
 'flights-airport',
 'gapminder',
 'gapminder-health-income',
 'gimp',
 'github',
 'graticule',
 'income',
 'iowa-electricity',
 'iris',
 'jobs',
 'la-riots',
 'londonBoroughs',
 'londonCentroids',
 'londonTubeLines',
 'lookup_groups',
 'lookup_people',
 'miserables',
 'monarchs',
 'movies',
 'normal-2d',
 'obesity',
 'ohlc',
 'points',
 'population',
 'population_engineers_hurricanes',
 'seattle-temps',
 'seattle-weather',
 'sf-temps',
 'sp500',
 'stocks',
 'udistrict',
 'unemployment',
 'unemployment-across-industries',
 'uniform-2d',
 'us-10m',
 'us-employment',
 'us-state-capitals',
 'volcano',
 'weather',
 'weball26',
 'wheat',
 'windvectors',
 'world-110m',
 'zipcodes']
import vega_datasets as ds
ds.data.cars()
ds.data('iris')
#---------------------------------------------------------------------------
ds.data('7zip')
# error [SSL: CERTIFICATE_VERIFY_FAILED] 
# certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1006)>

# Fix: point urllib at a trusted CA certificate bundle
import certifi
import ssl
import urllib.request

# Locate the CA (certificate-authority) bundle installed with Python's certifi
print(certifi.where())
# ❯ python -m certifi
# /Users/onesixx/Library/r-miniconda/envs/py11/lib/python3.11/site-packages/certifi/cacert.pem
cafile = certifi.where()
ssl_context = ssl.create_default_context(cafile=cafile)

# Equivalent shell-level workaround:
# export SSL_CERT_FILE=$(python -m certifi) 
# export REQUESTS_CA_BUNDLE=$(python -m certifi)

# Make urllib's urlopen use the SSL context built above
url = ds.data('7zip') # NOTE(review): the original comment claimed this returns the dataset's URL, but ds.data('7zip') looks like it attempts the download itself (it is what raised the SSL error above) — confirm against the vega_datasets API before relying on this
request = urllib.request.urlopen(url, context=ssl_context)  # issue the request with the explicit SSL context

# Read the raw response body; how to parse it depends on the dataset's
# actual format, so adjust this part accordingly.
data = request.read()
print(data)  # or parse the data before use

seaborn

import seaborn as sns

# Fetch one of seaborn's bundled example datasets as a DataFrame.
titanic = sns.load_dataset('titanic')

# Every dataset name seaborn can fetch (bare expression: echoes in a REPL).
sns.get_dataset_names()
['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

# Print the name and the first rows of every bundled seaborn dataset.
for name in sns.get_dataset_names():
    print(name)
    frame = sns.load_dataset(name)
    print(frame.head())

import seaborn as sns
import pandas as pd

# Load the Dow Jones sample dataset.
df = sns.load_dataset("dowjones")
# Normalize the 'Date' column: parse to datetime, then render back to
# 'YYYY-MM-DD' strings in a single pass.
df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')

plotly.express

import plotly.express as px
from palmerpenguins import load_penguins

# Example datasets bundled with plotly.express and palmerpenguins.
iris = px.data.iris()
penguins = load_penguins()
# pip install scikit-learn

import pandas as pd

### 1. scikit-learn datasets
import sklearn.datasets as sk_datasets
# Names of the bundled loader functions (bare expression: echoes in a REPL).
[i for i in dir(sk_datasets) if 'load' in i]

# load_iris() returns a Bunch (a dict subclass with attribute access), not a
# tuple — the original `load_iris()[0]` raised KeyError. Use the Bunch directly.
iris = sk_datasets.load_iris()
print(iris.DESCR)

# Build a DataFrame from the feature matrix and append the target labels.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['Species'] = iris.target

### 2. statsmodels datasets
import statsmodels.api as sm

# Public helpers exposed by sm.datasets (bare expression: echoes in a REPL).
[name for name in dir(sm.datasets) if not name.startswith('_')]

# Fetch the classic iris data from the Rdatasets repository.
df = sm.datasets.get_rdataset('iris').data

### 3. vega_datasets
import vega_datasets as vega

# Inspect the available dataset loaders (bare expression: echoes in a REPL).
dir(vega.data)

df = vega.data.iris()

# Optional pretty-printing of a random sample:
# from tabulate import tabulate
# print(tabulate(df.sample(10), headers='keys'))
Categories: Python Basic

onesixx

Blog Owner

Subscribe
Notify of
guest

0 Comments
Oldest
Newest Most Voted
Inline Feedbacks
View all comments
0
Would love your thoughts, please comment.x
()
x