Move datasets to project ageron/data to shrink this repo

main
Aurélien Geron 2022-02-19 21:36:43 +13:00
parent 8745a9c2ac
commit c9b977309a
6 changed files with 27 additions and 31 deletions

21
.gitignore vendored
View File

@ -4,19 +4,14 @@
*.old *.old
*.pyc *.pyc
.DS_Store .DS_Store
.ipynb_checkpoints .ipynb_checkpoints/
.vscode/ .vscode/
checkpoint checkpoint
logs/* /logs
tf_logs/* /tf_logs
images/**/*.png /images
images/**/*.dot
my_* my_*
person.proto /person.proto
person.desc /person.desc
person_pb2.py /person_pb2.py
datasets/flowers /datasets
datasets/spam
datasets/words
datasets/jsb_chorales

View File

@ -130,11 +130,11 @@
"datapath = Path() / \"datasets\" / \"lifesat\"\n", "datapath = Path() / \"datasets\" / \"lifesat\"\n",
"datapath.mkdir(parents=True, exist_ok=True)\n", "datapath.mkdir(parents=True, exist_ok=True)\n",
"\n", "\n",
"root = \"https://raw.githubusercontent.com/ageron/handson-ml3/main/\"\n", "data_root = \"https://github.com/ageron/data/raw/main/\"\n",
"filename = \"lifesat.csv\"\n", "filename = \"lifesat.csv\"\n",
"if not (datapath / filename).is_file():\n", "if not (datapath / filename).is_file():\n",
" print(\"Downloading\", filename)\n", " print(\"Downloading\", filename)\n",
" url = root + \"datasets/lifesat/\" + filename\n", " url = data_root + \"lifesat/\" + filename\n",
" urllib.request.urlretrieve(url, datapath / filename)" " urllib.request.urlretrieve(url, datapath / filename)"
] ]
}, },
@ -283,7 +283,7 @@
"for filename in (\"oecd_bli.csv\", \"gdp_per_capita.csv\"):\n", "for filename in (\"oecd_bli.csv\", \"gdp_per_capita.csv\"):\n",
" if not (datapath / filename).is_file():\n", " if not (datapath / filename).is_file():\n",
" print(\"Downloading\", filename)\n", " print(\"Downloading\", filename)\n",
" url = root + \"datasets/lifesat/\" + filename\n", " url = data_root + \"lifesat/\" + filename\n",
" urllib.request.urlretrieve(url, datapath / filename)" " urllib.request.urlretrieve(url, datapath / filename)"
] ]
}, },

View File

@ -110,8 +110,8 @@
" housing_path = Path() / \"datasets\" / \"housing\"\n", " housing_path = Path() / \"datasets\" / \"housing\"\n",
" if not (housing_path / \"housing.csv\").is_file():\n", " if not (housing_path / \"housing.csv\").is_file():\n",
" housing_path.mkdir(parents=True, exist_ok=True)\n", " housing_path.mkdir(parents=True, exist_ok=True)\n",
" root = \"https://raw.githubusercontent.com/ageron/handson-ml3/main/\"\n", " data_root = \"https://github.com/ageron/data/raw/main/\"\n",
" url = root + \"datasets/housing/housing.tgz\"\n", " url = data_root + \"housing/housing.tgz\"\n",
" tgz_path = housing_path / \"housing.tgz\"\n", " tgz_path = housing_path / \"housing.tgz\"\n",
" urllib.request.urlretrieve(url, tgz_path)\n", " urllib.request.urlretrieve(url, tgz_path)\n",
" with tarfile.open(tgz_path) as housing_tgz:\n", " with tarfile.open(tgz_path) as housing_tgz:\n",
@ -578,8 +578,8 @@
"# Download the California image\n", "# Download the California image\n",
"filename = \"california.png\"\n", "filename = \"california.png\"\n",
"if not (IMAGES_PATH / filename).is_file():\n", "if not (IMAGES_PATH / filename).is_file():\n",
" root = \"https://raw.githubusercontent.com/ageron/handson-ml3/main/\"\n", " homl3_root = \"https://github.com/ageron/handson-ml3/raw/main/\"\n",
" url = root + \"images/end_to_end_project/\" + filename\n", " url = homl3_root + \"images/end_to_end_project/\" + filename\n",
" print(\"Downloading\", filename)\n", " print(\"Downloading\", filename)\n",
" urllib.request.urlretrieve(url, IMAGES_PATH / filename)\n", " urllib.request.urlretrieve(url, IMAGES_PATH / filename)\n",
"\n", "\n",

View File

@ -1635,8 +1635,8 @@
" filepath = titanic_path / filename\n", " filepath = titanic_path / filename\n",
" if filepath.is_file():\n", " if filepath.is_file():\n",
" continue\n", " continue\n",
" root = \"https://raw.githubusercontent.com/ageron/handson-ml3/main/\"\n", " data_root = \"https://github.com/ageron/data/raw/main/\"\n",
" url = root + \"/datasets/titanic/\" + filename\n", " url = data_root + \"titanic/\" + filename\n",
" print(\"Downloading\", filename)\n", " print(\"Downloading\", filename)\n",
" urllib.request.urlretrieve(url, filepath)\n", " urllib.request.urlretrieve(url, filepath)\n",
" return [pd.read_csv(titanic_path / filename) for filename in filenames]" " return [pd.read_csv(titanic_path / filename) for filename in filenames]"
@ -2123,9 +2123,9 @@
"import tarfile\n", "import tarfile\n",
"\n", "\n",
"def fetch_spam_data():\n", "def fetch_spam_data():\n",
" root = \"http://spamassassin.apache.org/old/publiccorpus/\"\n", " spam_root = \"http://spamassassin.apache.org/old/publiccorpus/\"\n",
" ham_url = root + \"20030228_easy_ham.tar.bz2\"\n", " ham_url = spam_root + \"20030228_easy_ham.tar.bz2\"\n",
" spam_url = root + \"20030228_spam.tar.bz2\"\n", " spam_url = spam_root + \"20030228_spam.tar.bz2\"\n",
"\n", "\n",
" spam_path = Path() / \"datasets\" / \"spam\"\n", " spam_path = Path() / \"datasets\" / \"spam\"\n",
" spam_path.mkdir(parents=True, exist_ok=True)\n", " spam_path.mkdir(parents=True, exist_ok=True)\n",

View File

@ -755,8 +755,8 @@
" housing_path = Path() / \"datasets\" / \"housing\"\n", " housing_path = Path() / \"datasets\" / \"housing\"\n",
" if not (housing_path / \"housing.csv\").is_file():\n", " if not (housing_path / \"housing.csv\").is_file():\n",
" housing_path.mkdir(parents=True, exist_ok=True)\n", " housing_path.mkdir(parents=True, exist_ok=True)\n",
" root = \"https://raw.githubusercontent.com/ageron/handson-ml3/main/\"\n", " root = \"https://github.com/ageron/data/raw/main/\"\n",
" url = root + \"datasets/housing/housing.tgz\"\n", " url = root + \"housing/housing.tgz\"\n",
" tgz_path = housing_path / \"housing.tgz\"\n", " tgz_path = housing_path / \"housing.tgz\"\n",
" urllib.request.urlretrieve(url, tgz_path)\n", " urllib.request.urlretrieve(url, tgz_path)\n",
" with tarfile.open(tgz_path) as housing_tgz:\n", " with tarfile.open(tgz_path) as housing_tgz:\n",

View File

@ -895,7 +895,6 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import urllib.request\n",
"from sklearn.datasets import fetch_openml\n", "from sklearn.datasets import fetch_openml\n",
"\n", "\n",
"mnist = fetch_openml('mnist_784', as_frame=False)" "mnist = fetch_openml('mnist_784', as_frame=False)"
@ -1303,14 +1302,16 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# extra code\n", "# extra code downloads the ladybug image\n",
"\n", "\n",
"root = \"https://raw.githubusercontent.com/ageron/handson-ml3/main/\"\n", "import urllib.request\n",
"\n",
"homl3_root = \"https://github.com/ageron/handson-ml3/raw/main/\"\n",
"filename = \"ladybug.png\"\n", "filename = \"ladybug.png\"\n",
"filepath = IMAGES_PATH / filename\n", "filepath = IMAGES_PATH / filename\n",
"if not filepath.is_file():\n", "if not filepath.is_file():\n",
" print(\"Downloading\", filename)\n", " print(\"Downloading\", filename)\n",
"    url = f\"{root}/images/unsupervised_learning/{filename}\"\n", "    url = f\"{homl3_root}/images/unsupervised_learning/{filename}\"\n",
" urllib.request.urlretrieve(url, filepath)" " urllib.request.urlretrieve(url, filepath)"
] ]
}, },