基于pyspark的英雄联盟游戏数据分析
舟率率 12/17/2025 python scala
# 项目概况
# 数据类型
英雄联盟游戏数据
# 开发环境
centos7
# 软件版本
python3.8.18、hadoop3.2.0、spark3.1.2、scala2.12.18、jdk8
# 开发语言
python
# 开发流程
数据上传(hdfs)->数据预处理(pyspark)->数据分析(pyspark)->可视化(matplotlib)
# 可视化图表

# 操作步骤
# python安装包
# linux
pip3 install pandas==2.0.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install numpy==1.24.4 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install matplotlib==3.7.4 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install notebook==7.3.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install plotly==6.2.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install wordcloud==1.9.4 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 uninstall urllib3 -y
pip3 install urllib3==1.22 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install seaborn==0.13.2 -i https://pypi.tuna.tsinghua.edu.cn/simple
# 启动Hadoop
# 若 HDFS 启动后处于安全模式, 需在启动完成后执行以下命令离开安全模式: hdfs dfsadmin -safemode leave
# 启动hadoop
bash /export/software/hadoop-3.2.0/sbin/start-hadoop.sh
# 准备目录
mkdir -p /data/jobs/project/
cd /data/jobs/project/
# 上传 "project-pyspark-league-data-analysis" 整个文件夹 到 "/data/jobs/project/" 目录
# 上传 "league_data.xlsx" 文件 到 "/data/jobs/project/project-pyspark-league-data-analysis" 目录
# 将 xlsx 转换成 csv
python3 /data/jobs/project/project-pyspark-league-data-analysis/file_handler.py
ls /data/jobs/project/project-pyspark-league-data-analysis/league_data.csv
# 上传hdfs
cd /data/jobs/project/project-pyspark-league-data-analysis/
hdfs dfs -mkdir -p /user/lol/data/
hdfs dfs -rm -r /user/lol/data/*
hdfs dfs -put league_data.csv /user/lol/data/
hdfs dfs -ls /user/lol/data/
# Jupyter Notebook 在 Linux 的安装配置
# 生成配置文件
jupyter notebook --generate-config
# 密码生成
在 Linux 终端输入 ipython 进入交互环境, 然后执行以下命令生成密码哈希(按提示输入并确认密码, 本例使用的密码为: 123456)
from jupyter_server.auth import passwd;
passwd()
# quit 退出
# 新增配置参数
~/.jupyter/jupyter_notebook_config.py文件中新增以下配置参数
# 在首行添加
from jupyter_server.auth import PasswordIdentityProvider
# 在文件最后添加(如使用自己生成的密码, 请将下面的哈希值替换为上一步 passwd() 的输出)
PasswordIdentityProvider.hashed_password = u"argon2:$argon2id$v=19$m=10240,t=10,p=8$5uOsl3QQlX9jjWhooywTLA$ey1K7rLkBElfg9ucAg5VTsArEHz2/6V8ZH/FDs1d4Qg"
c.ServerApp.ip = '*'
c.ServerApp.open_browser = False
c.ServerApp.port = 5173
c.ServerApp.allow_remote_access = True
# 启动jupyter
# 前往指定目录
cd /data/jobs/project/project-pyspark-league-data-analysis/
# 开启服务器 http://master:5173
# 登录密码: 123456
PYSPARK_DRIVER_PYTHON=jupyter \
PYSPARK_DRIVER_PYTHON_OPTS="notebook --ip=0.0.0.0 --port=5173 --allow-root --no-browser" \
pyspark \
--driver-memory 2g \
--executor-memory 2g