基于pyspark的地震数据处理与分析
舟率率 7/18/2025
原地址:https://dblab.xmu.edu.cn/blog/2651/
# 项目概况
# 数据类型
1965-2016全球重大地震数据
# 开发环境
centos7
# 软件版本
python3.8.18
# 开发语言
python
# 操作步骤
# python安装包
# windows
pip install findspark==2.0.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install pandas==2.0.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install numpy==1.24.4 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install matplotlib==3.7.4 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install notebook==7.3.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install plotly==6.2.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install wordcloud==1.9.4 -i https://pypi.tuna.tsinghua.edu.cn/simple
# linux
pip3 install pandas==2.0.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install numpy==1.24.4 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install matplotlib==3.7.4 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install notebook==7.3.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install plotly==6.2.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install wordcloud==1.9.4 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 uninstall urllib3 -y
pip3 install urllib3==1.22 -i https://pypi.tuna.tsinghua.edu.cn/simple
# 准备目录
mkdir -p /data/jobs/project/
cd /data/jobs/project/
# 解压 "data" 目录下的 "data.zip" 文件
# 上传 "project-pyspark-earthquake-data-analysis" 整个文件夹到 "/data/jobs/project/" 目录
# Jupyter Notebook 在 Linux 上的安装配置
# 生成配置文件
jupyter notebook --generate-config
# 密码生成
在 Linux 终端输入 ipython,进入 ipython 交互环境,
然后执行以下命令生成密码哈希(按提示用键盘输入并确认密码,例如:123456)
from jupyter_server.auth import passwd;
passwd()
# 输入 quit 退出 ipython
# 新增配置参数
在 ~/.jupyter/jupyter_notebook_config.py 文件中新增以下配置参数
# 在首行添加
from jupyter_server.auth import PasswordIdentityProvider
# 在文件最后添加(下面的哈希值仅为示例,请替换为 passwd() 实际输出的字符串)
PasswordIdentityProvider.hashed_password = u"argon2:$argon2id$v=19$m=10240,t=10,p=8$qkoJi4IXkvNJ0PI3jvnwQQ$pCvwhvKo7clIMNAaq6Ox4sH6UfL8sXhQRCxTq7dGTVY"
c.ServerApp.ip = '*'
c.ServerApp.open_browser = False
c.ServerApp.port = 5173
c.ServerApp.allow_remote_access = True
# 启动jupyter
# 前往指定目录
cd /data/jobs/project/project-pyspark-earthquake-data-analysis/
# 开启服务器 http://master:5173
# 登录密码: 123456
jupyter notebook --allow-root --no-browser