基于pyspark的英雄联盟游戏数据分析

12/17/2025 python scala

可视化效果视频 (opens new window)

# 项目概况

master (opens new window)

# 数据类型

英雄联盟游戏数据

# 开发环境

centos7

# 软件版本

python3.8.18、hadoop3.2.0、spark3.1.2、scala2.12.18、jdk8

# 开发语言

python

# 开发流程

数据上传(hdfs)->数据预处理(pyspark)->数据分析(pyspark)->可视化(matplotlib)

# 可视化图表

screen

# 操作步骤

# python安装包


# Linux: install the pinned Python packages one by one from the Tsinghua PyPI mirror.
for spec in pandas==2.0.3 numpy==1.24.4 matplotlib==3.7.4 notebook==7.3.3 plotly==6.2.0 wordcloud==1.9.4; do
  pip3 install "$spec" -i https://pypi.tuna.tsinghua.edu.cn/simple
done
# Remove the urllib3 that is currently installed before pinning it to 1.22.
pip3 uninstall urllib3 -y
for spec in urllib3==1.22 seaborn==0.13.2; do
  pip3 install "$spec" -i https://pypi.tuna.tsinghua.edu.cn/simple
done

# 启动Hadoop


# To leave HDFS safe mode, run: hdfs dfsadmin -safemode leave
# Start Hadoop.
# NOTE(review): start-hadoop.sh is presumably a site-specific wrapper (the stock
# sbin/ scripts are start-dfs.sh, start-yarn.sh and start-all.sh) — confirm it exists on this host.
bash /export/software/hadoop-3.2.0/sbin/start-hadoop.sh

hadoop

# 准备目录


mkdir -p /data/jobs/project/
cd /data/jobs/project/

# Upload the whole "project-pyspark-league-data-analysis" folder into "/data/jobs/project/".
# Upload the "league_data.xlsx" file into "/data/jobs/project/project-pyspark-league-data-analysis".

# Convert the xlsx file to csv.
python3 /data/jobs/project/project-pyspark-league-data-analysis/file_handler.py

# List the csv file at the path where it is expected after the conversion.
ls /data/jobs/project/project-pyspark-league-data-analysis/league_data.csv

# 上传hdfs


# Upload the converted csv to HDFS, replacing whatever was uploaded before.
cd /data/jobs/project/project-pyspark-league-data-analysis/

hdfs dfs -mkdir -p /user/lol/data/
# -f keeps the first run quiet: without it, rm reports an error when the
# directory is still empty and the pattern matches nothing.
# The pattern is quoted so that HDFS, not the local shell, expands the glob.
hdfs dfs -rm -r -f '/user/lol/data/*'
hdfs dfs -put league_data.csv /user/lol/data/
# List the target directory to confirm the upload.
hdfs dfs -ls /user/lol/data/

# Jupyter Notebook 在 Linux 上的安装配置

# 生成配置文件


# Generate the Jupyter Notebook configuration file
# (~/.jupyter/jupyter_notebook_config.py is the file edited in a later step).
jupyter notebook --generate-config

# 密码生成

在 Linux 终端输入 ipython 进入交互式环境,然后执行以下命令生成密码哈希(按提示用键盘输入密码,本例为 123456):


# Run inside IPython: passwd() asks for the password on the keyboard.
from jupyter_server import auth
auth.passwd()

# Type quit to exit.

# 新增配置参数

~/.jupyter/jupyter_notebook_config.py文件中新增以下配置参数


# Append everything below to the end of the file.
# The password hash is set through the config object `c`, the same object the
# ServerApp settings use, instead of being assigned onto the imported
# PasswordIdentityProvider class; no import at the top of the file is needed.
# Replace the hash with the one printed by passwd() for your own password.
c.PasswordIdentityProvider.hashed_password = u"argon2:$argon2id$v=19$m=10240,t=10,p=8$5uOsl3QQlX9jjWhooywTLA$ey1K7rLkBElfg9ucAg5VTsArEHz2/6V8ZH/FDs1d4Qg"
# Server address, browser and port settings.
c.ServerApp.ip = '*'
c.ServerApp.open_browser = False
c.ServerApp.port = 5173
c.ServerApp.allow_remote_access = True

# 启动jupyter


# Change to the project directory.
cd /data/jobs/project/project-pyspark-league-data-analysis/

# Start the notebook server at http://master:5173
# Login password: 123456
# Both PYSPARK_DRIVER_* variables are set for this single pyspark invocation only.
PYSPARK_DRIVER_PYTHON=jupyter \
PYSPARK_DRIVER_PYTHON_OPTS='notebook --ip=0.0.0.0 --port=5173 --allow-root --no-browser' \
  pyspark --driver-memory 2g --executor-memory 2g

Last Updated: 12/23/2025, 1:40:47 AM