APK parsing basics - Android application package structure analysis and reverse engineering techniques
# Basics of APK analysis: from installation package to security analysis
I recently helped a friend check a malicious APK disguised as a takeaway "red envelope" (coupon) giveaway, and within 5 minutes I found that it was secretly requesting permissions to read text messages and photo albums - all thanks to my familiarity with the APK structure and a few lightweight tools.
Today we will start from four dimensions: Basic Essential Understanding, Lightweight Python Parsing, Decompilation Tool Integration, and Low-threshold Security Scanning to take you through the secrets of Android application packages. The full text is about 2,600 words, and the code can be copied and run directly, making it friendly to novices!
1. The essence of APK: just a ZIP compressed package with a tamper-proof signature
Many people think that APK is a mysterious and exclusive format, but in fact it is a standard PKZIP compressed package - you can even open it directly with Windows' own compression software or Mac's archiving tool to see the surface file structure.
However, there is one thing to pay special attention to: you cannot simply decompress an APK and compress it again. The compression algorithm, compression ratio, and file order of the re-compressed package may differ from the original, and the Android system verifies the package during installation. If the signature hashes in the `META-INF/` directory do not match, the installation is refused outright.
The following is the core file/directory structure of the APK, with tips to help you distinguish the functions of each directory:
Today's lightweight parser will not deeply parse the binary Manifest or the internal DEX structure. It mainly does the "out-of-the-box" work: file metadata, resource statistics, CPU architecture support, hash calculation, and so on. It requires only the Python standard library (`zipfile`, `hashlib`, and `pathlib`); there is no need for complex third-party reverse-engineering libraries, so the entry barrier is almost zero!
# apk_analyzer.py
import zipfile
import hashlib
from typing import Dict, List, Optional
from pathlib import Path
class APKAnalyzer:
    """Lightweight APK parser: surface-level metadata and file-structure statistics.

    The APK is treated as a plain ZIP archive. Only the ZIP directory and the
    raw bytes of the APK file are inspected; the binary manifest and the DEX
    internals are not parsed.
    """

    # Bytes read per iteration while hashing, so memory use stays constant
    # regardless of the APK size.
    _HASH_CHUNK_SIZE = 1024 * 1024

    def __init__(self, apk_path: str):
        """Remember the APK location and prepare the empty result dictionary.

        Args:
            apk_path: Path to the APK file to analyze.
        """
        self.apk_path = Path(apk_path)
        # Result dictionary with a fixed set of keys; filled in by analyze().
        self.apk_info = {
            "metadata": {},
            "all_files": [],
            "dex_files": [],
            "native_libraries": {},
            "resources": {},
        }

    def analyze(self) -> Dict:
        """Run the full surface-level analysis and return the result dictionary.

        Returns:
            ``self.apk_info`` with the keys ``metadata``, ``all_files``,
            ``dex_files``, ``native_libraries`` and ``resources`` populated.

        Raises:
            FileNotFoundError: If ``apk_path`` is not an existing regular file.
            zipfile.BadZipFile: If the file is not a valid ZIP archive.
        """
        # is_file() also rejects directories, which would otherwise only fail
        # later with a less helpful error while hashing.
        if not self.apk_path.is_file():
            raise FileNotFoundError(f"APK 文件未找到,请检查路径: {self.apk_path}")

        self._get_file_metadata()

        with zipfile.ZipFile(self.apk_path, "r") as zip_apk:
            self.apk_info["all_files"] = zip_apk.namelist()
            self._get_dex_statistics(zip_apk)
            self._get_native_library_info(zip_apk)
            self._get_resource_statistics(zip_apk)
        return self.apk_info

    def _get_file_metadata(self):
        """Store the resolved path, size in MB, SHA-256 and MD5 of the APK file."""
        file_stat = self.apk_path.stat()
        # Both digests are computed in one pass so the APK is read only once.
        digests = self._calculate_file_hashes(("sha256", "md5"))
        self.apk_info["metadata"] = {
            "full_path": str(self.apk_path.resolve()),
            "size_mb": round(file_stat.st_size / (1024 * 1024), 2),
            "sha256": digests["sha256"],
            "md5": digests["md5"],
        }

    def _calculate_file_hashes(self, algorithms) -> Dict:
        """Hash the APK file with several algorithms in a single read pass.

        Args:
            algorithms: Iterable of algorithm names accepted by ``hashlib.new``.

        Returns:
            Mapping of algorithm name to hexadecimal digest.
        """
        hashers = {name: hashlib.new(name) for name in algorithms}
        with open(self.apk_path, "rb") as apk_file:
            for chunk in iter(lambda: apk_file.read(self._HASH_CHUNK_SIZE), b""):
                for hasher in hashers.values():
                    hasher.update(chunk)
        return {name: hasher.hexdigest() for name, hasher in hashers.items()}

    def _calculate_file_hash(self, algorithm: str) -> str:
        """Return the hexadecimal digest of the APK file for one algorithm.

        Args:
            algorithm: Algorithm name accepted by ``hashlib.new``.
        """
        return self._calculate_file_hashes((algorithm,))[algorithm]

    def _get_dex_statistics(self, zip_apk: zipfile.ZipFile):
        """Record the name and uncompressed size (KB) of every ``.dex`` entry.

        The size is taken from the ZIP directory (``ZipInfo.file_size``) rather
        than by decompressing each DEX file into memory.
        """
        self.apk_info["dex_files"] = [
            {
                "filename": entry.filename,
                "size_kb": round(entry.file_size / 1024, 2),
            }
            for entry in zip_apk.infolist()
            if entry.filename.endswith(".dex")
        ]

    def _get_native_library_info(self, zip_apk: zipfile.ZipFile):
        """Count ``.so`` files under ``lib/`` and collect their CPU architectures."""
        so_files = [
            name
            for name in zip_apk.namelist()
            if name.startswith("lib/") and name.endswith(".so")
        ]
        architectures = set()
        for name in so_files:
            # Expected layout: lib/<cpu-arch>/<name>.so. A .so placed directly
            # under lib/ has no architecture directory, so it is counted but
            # does not contribute an architecture name.
            parts = name.split("/")
            if len(parts) >= 3 and parts[1]:
                architectures.add(parts[1])
        self.apk_info["native_libraries"] = {
            "total_so_count": len(so_files),
            # Sorted so the report is deterministic across runs.
            "supported_cpu_arch": sorted(architectures),
        }

    @staticmethod
    def _is_resource_of_type(name: str, resource_type: str) -> bool:
        """Tell whether *name* lives in ``res/<resource_type>[-qualifier]/``."""
        parts = name.split("/")
        if len(parts) < 3 or parts[0] != "res":
            return False
        directory = parts[1]
        return directory == resource_type or directory.startswith(resource_type + "-")

    def _get_resource_statistics(self, zip_apk: zipfile.ZipFile):
        """Count the entries under ``res/`` and ``assets/`` by category."""
        # Entries ending in "/" are ZIP directory records, not resources.
        resource_names = [
            name
            for name in zip_apk.namelist()
            if name.startswith(("res/", "assets/")) and not name.endswith("/")
        ]
        # Match on the resource-type directory rather than on a substring, so
        # that e.g. assets/layout_config.json is not counted as a layout.
        self.apk_info["resources"] = {
            "total_resource_count": len(resource_names),
            "drawable_icon_count": sum(
                self._is_resource_of_type(name, "drawable") for name in resource_names
            ),
            "layout_page_count": sum(
                self._is_resource_of_type(name, "layout") for name in resource_names
            ),
            "original_asset_count": sum(
                name.startswith("assets/") for name in resource_names
            ),
        }
def main():
    """Demo entry point: analyze a sample APK and print a short summary."""
    # Replace with a real APK path (a bare file name is resolved against the
    # current directory; otherwise use an absolute path).
    sample_apk = "test.apk"
    try:
        print(f"🚀 开始解析 APK: {sample_apk}...")
        report = APKAnalyzer(sample_apk).analyze()

        rule = "=" * 30
        print("\n" + rule + " APK 解析结果 " + rule)

        metadata = report["metadata"]
        print(f"📁 文件完整路径: {metadata['full_path']}")
        print(f"⚖️ 文件大小: {metadata['size_mb']} MB")
        # Only the first 16 hex digits are shown to keep the line short.
        short_digest = metadata["sha256"][:16]
        print(f"🔐 SHA256 哈希: {short_digest}...")

        dex_count = len(report["dex_files"])
        print(f"💻 DEX 文件数: {dex_count}")

        arch_text = ", ".join(report["native_libraries"]["supported_cpu_arch"])
        print(f"⚙️ 支持 CPU 架构: {arch_text}")

        resource_total = report["resources"]["total_resource_count"]
        print(f"🎨 资源总数: {resource_total}")
    except Exception as error:
        # Top-level boundary of the demo script: report the failure and exit.
        print(f"❌ APK 解析失败: {error}")


if __name__ == "__main__":
    main()
Lightweight analysis can only see surface information. If you want to get the plain text AndroidManifest.xml, deobfuscated Java/Kotlin code and editable resource files, you must use professional decompilation tools.
We can use Python's `subprocess` library to quickly integrate these tools and automate decompilation. Today we will first demonstrate the most commonly used one, jadx. The integration logic for other tools is similar, and you can extend it accordingly.
jadx integration code
# apk_decompiler.py
import subprocess
import os
from typing import Optional
class APKDecompiler:
    """Integration wrapper around external decompilers; currently only jadx."""

    @staticmethod
    def is_jadx_installed() -> bool:
        """Return True when ``jadx --version`` can be run and exits with code 0."""
        try:
            result = subprocess.run(
                ["jadx", "--version"],
                capture_output=True,
                text=True,
                timeout=10,
            )
        except (OSError, subprocess.SubprocessError, UnicodeError):
            # Executable missing / not runnable, timed out, or produced output
            # that could not be decoded: treat all of these as "not available".
            return False
        return result.returncode == 0

    @staticmethod
    def _default_output_dir(apk_path: str) -> str:
        """Build the default output directory name ``jadx_output_<apk name>``.

        Only a trailing ``.apk`` extension (any letter case) is removed;
        ``.apk`` occurring elsewhere in the file name is kept.
        """
        name = os.path.basename(apk_path)
        if name.lower().endswith(".apk"):
            name = name[: -len(".apk")]
        return f"jadx_output_{name}"

    @staticmethod
    def decompile_apk_with_jadx(
        apk_path: str,
        output_dir: Optional[str] = None,
        skip_resources: bool = False
    ) -> Optional[str]:
        """Decompile an APK with jadx.

        Args:
            apk_path: Path of the APK to decompile.
            output_dir: Directory for the decompiled output; derived from the
                APK file name when omitted.
            skip_resources: Pass ``-r`` to jadx so resources are not decoded
                (code only).

        Returns:
            The output directory on success, or ``None`` when jadx is not
            available, the APK does not exist, jadx exits with a non-zero
            code, the run times out, or launching it fails.
        """
        # Step 1: make sure jadx can be run at all.
        if not APKDecompiler.is_jadx_installed():
            print("❌ 请先安装 jadx 并添加到系统 PATH!")
            print("👉 下载地址:https://github.com/skylot/jadx/releases")
            return None

        # Step 2: fail early with a clear message instead of starting jadx on
        # a path that does not exist.
        if not os.path.isfile(apk_path):
            print(f"❌ APK 文件未找到,请检查路径: {apk_path}")
            return None

        # Step 3: choose the output directory.
        output_dir = output_dir or APKDecompiler._default_output_dir(apk_path)

        # Step 4: build the command as an argument list (no shell involved).
        cmd = ["jadx", "-d", output_dir, apk_path]
        if skip_resources:
            cmd.insert(1, "-r")

        try:
            print("🚀 开始执行 jadx 反编译...")
            print(f"📝 执行命令: {' '.join(cmd)}")
            # 300-second limit so a very large APK cannot hang the script.
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=300,
            )
        except subprocess.TimeoutExpired:
            print("❌ jadx 反编译超时(超过 5 分钟),请尝试增大超时时间或跳过资源反编译!")
            return None
        except Exception as e:
            # Best-effort wrapper: report any launch failure instead of raising.
            print(f"❌ jadx 反编译异常: {e}")
            return None

        if result.returncode != 0:
            print("❌ jadx 反编译失败!")
            print(f"🔍 错误信息: {result.stderr}")
            return None

        print("✅ jadx 反编译成功!")
        print(f"📂 反编译结果输出目录: {os.path.abspath(output_dir)}")
        return output_dir
def main():
    """Demo entry point: decompile a sample APK, resources included."""
    # Replace with a real APK path before running.
    sample_apk = "test.apk"
    APKDecompiler.decompile_apk_with_jadx(apk_path=sample_apk, skip_resources=False)


if __name__ == "__main__":
    main()
4. Lightweight security scan: quickly identify high-risk permissions and debugging/backup flags
Malicious APKs often start with low-threshold but high-risk points such as high-risk permissions and debug/backup flags. We can use Python to quickly implement a simplified version of the security scanner - although the method of searching the binary manifest by keywords is not 100% rigorous, it is sufficient for preliminary quick screening and is especially suitable for security introductory exercises.
# apk_security_scanner.py
import zipfile
import re
from typing import Dict, List
from pathlib import Path
from apk_analyzer import APKAnalyzer # 复用之前的轻量解析器
class APKSecurityScanner:
    """Lightweight APK security scanner: first-pass screening of the manifest.

    The scanner looks for well-known dangerous permission names and for the
    ``android:debuggable`` / ``android:allowBackup`` flags being set to true.
    It is a heuristic, not a full manifest parser; use androguard or an AXML
    parser when exact results are required.
    """

    MANIFEST_NAME = "AndroidManifest.xml"

    # Suffixes of the android.permission.* names treated as dangerous.
    DANGEROUS_PERMISSION_KEYWORDS = (
        "CAMERA", "RECORD_AUDIO", "ACCESS_FINE_LOCATION",
        "READ_CONTACTS", "READ_SMS", "SEND_SMS", "READ_PHONE_STATE",
        "WRITE_EXTERNAL_STORAGE", "READ_CALL_LOG", "CALL_PHONE",
    )

    # (attribute, risk level, issue type, description) for each flag check.
    _FLAG_CHECKS = (
        (
            "debuggable",
            "critical",
            "debug_mode_enabled",
            "应用可能启用了调试模式,恶意攻击者可利用此获取应用内部数据",
        ),
        (
            "allowBackup",
            "medium",
            "allow_backup_enabled",
            "应用可能允许通过 adb 备份数据,存在数据泄露风险",
        ),
    )

    # Android framework resource IDs of the attributes above; the resource map
    # of a binary manifest links string-pool indices to these IDs.
    _ATTR_RESOURCE_IDS = {"debuggable": 0x0101000F, "allowBackup": 0x01010280}

    # Binary XML constants (AOSP ResourceTypes.h).
    _CHUNK_XML = 0x0003
    _CHUNK_RESOURCE_MAP = 0x0180
    _CHUNK_START_ELEMENT = 0x0102
    _TYPE_INT_BOOLEAN = 0x12

    def __init__(self, apk_path: str):
        """Remember the APK location and start with an empty issue list.

        Args:
            apk_path: Path to the APK file to scan.
        """
        self.apk_path = Path(apk_path)
        self.analyzer = APKAnalyzer(apk_path)
        self.issue_list = []

    def scan(self) -> Dict:
        """Run all checks and return the report dictionary.

        Raises:
            FileNotFoundError, zipfile.BadZipFile: If the APK cannot be opened
                as a ZIP archive. A missing or unreadable manifest entry is
                reported on stdout and the affected check is skipped.
        """
        # Start from a fresh list so calling scan() twice does not duplicate
        # findings (and does not mutate a previously returned report).
        self.issue_list = []
        self._scan_dangerous_permissions()
        self._scan_sensitive_manifest_flags()
        return self._generate_scan_report()

    def _load_manifest(self, skipped_check_label: str):
        """Return the raw manifest bytes, or None when the entry cannot be read.

        Args:
            skipped_check_label: Name of the check, used in the "skipped"
                message printed when the manifest entry is unavailable.
        """
        with zipfile.ZipFile(self.apk_path, "r") as zip_apk:
            try:
                return zip_apk.read(self.MANIFEST_NAME)
            except Exception as e:
                # Deliberately best-effort: a broken or missing manifest only
                # skips this check instead of aborting the scan.
                print(f"⚠️ {skipped_check_label}跳过: {e}")
                return None

    @staticmethod
    def _contains_text(data: bytes, text: str) -> bool:
        """Tell whether *text* occurs in *data* encoded as UTF-8 or UTF-16LE.

        Binary manifests keep their strings in a pool that is either UTF-8 or
        UTF-16LE, so a plain ASCII search misses the UTF-16 case.
        """
        return text.encode("utf-8") in data or text.encode("utf-16-le") in data

    def _scan_dangerous_permissions(self):
        """Record a high-risk issue for each dangerous permission name found.

        This is a byte search for the fully qualified permission name, so a
        longer name sharing the same prefix would also match.
        """
        manifest = self._load_manifest("高危权限扫描")
        if manifest is None:
            return
        for perm_keyword in self.DANGEROUS_PERMISSION_KEYWORDS:
            if self._contains_text(manifest, f"android.permission.{perm_keyword}"):
                self.issue_list.append({
                    "risk_level": "high",
                    "issue_type": "dangerous_permission",
                    "description": f"应用可能请求高危权限: android.permission.{perm_keyword}"
                })

    def _scan_sensitive_manifest_flags(self):
        """Record an issue for each sensitive manifest flag that is set to true."""
        manifest = self._load_manifest("敏感配置扫描")
        if manifest is None:
            return
        for attribute, risk_level, issue_type, description in self._FLAG_CHECKS:
            if self._flag_enabled(manifest, attribute):
                self.issue_list.append({
                    "risk_level": risk_level,
                    "issue_type": issue_type,
                    "description": description
                })

    def _flag_enabled(self, manifest: bytes, attribute: str) -> bool:
        """Tell whether *attribute* is explicitly set to true in the manifest.

        Binary manifests are inspected via their typed attribute values; any
        other content is treated as text XML and matched as
        ``attribute="true"``.
        """
        if int.from_bytes(manifest[:2], "little") == self._CHUNK_XML:
            return self._binary_flag_is_true(
                manifest, self._ATTR_RESOURCE_IDS[attribute]
            )
        pattern = (
            rb"\b" + attribute.encode("ascii") + rb"\s*=\s*[\"']true[\"']"
        )
        return re.search(pattern, manifest, re.IGNORECASE) is not None

    def _binary_flag_is_true(self, data: bytes, resource_id: int) -> bool:
        """Look for a boolean attribute with *resource_id* whose value is true.

        Walks the top-level chunks of a binary XML document: the resource map
        gives the string-pool index used as the attribute name, and every
        start-element chunk is checked for an attribute with that name index,
        data type boolean and a non-zero value. Attributes missing from the
        resource map are not detected.
        NOTE(review): layout offsets follow AOSP ResourceTypes.h; verify
        against real-world APKs before relying on the result.
        """
        def u16(offset: int) -> int:
            return int.from_bytes(data[offset:offset + 2], "little")

        def u32(offset: int) -> int:
            return int.from_bytes(data[offset:offset + 4], "little")

        total = len(data)
        name_index = None
        # Skip the document header (its size is stored at offset 2).
        pos = max(u16(2), 8)
        while pos + 8 <= total:
            chunk_type = u16(pos)
            header_size = u16(pos + 2)
            end = pos + u32(pos + 4)
            # Malformed sizes: stop rather than loop forever or read past EOF.
            if header_size < 8 or end < pos + header_size or end > total:
                break

            if chunk_type == self._CHUNK_RESOURCE_MAP:
                ids = [u32(off) for off in range(pos + header_size, end - 3, 4)]
                if resource_id in ids:
                    name_index = ids.index(resource_id)
            elif chunk_type == self._CHUNK_START_ELEMENT and name_index is not None:
                ext = pos + header_size  # start of the element's attribute header
                if ext + 20 <= end:
                    attr_start = u16(ext + 8)
                    attr_size = u16(ext + 10)
                    attr_count = u16(ext + 12)
                    for i in range(attr_count):
                        attr = ext + attr_start + i * attr_size
                        if attr_size < 20 or attr + 20 > end:
                            break
                        # Attribute layout: ns(4) name(4) raw(4) size(2)
                        # res0(1) dataType(1) data(4).
                        if (
                            u32(attr + 4) == name_index
                            and data[attr + 15] == self._TYPE_INT_BOOLEAN
                            and u32(attr + 16) != 0
                        ):
                            return True
            pos = end
        return False

    def _generate_scan_report(self) -> Dict:
        """Summarize the collected issues into a report dictionary.

        Returns:
            Dict with ``apk_path``, ``total_issues_found``,
            ``risk_level_statistics`` (count per risk level) and
            ``detailed_issues`` (the issue list itself).
        """
        risk_level_count = {"critical": 0, "high": 0, "medium": 0, "low": 0}
        for issue in self.issue_list:
            risk_level_count[issue["risk_level"]] += 1
        return {
            "apk_path": str(self.apk_path.resolve()),
            "total_issues_found": len(self.issue_list),
            "risk_level_statistics": risk_level_count,
            "detailed_issues": self.issue_list
        }
def main():
    """Demo entry point: scan a sample APK and print the findings."""
    # Replace with a real APK path before running.
    sample_apk = "test.apk"
    # Marker shown in front of each issue, keyed by its risk level.
    level_markers = {
        "critical": "🔴",
        "high": "🟠",
        "medium": "🟡",
        "low": "🔵"
    }
    try:
        print(f"🔍 开始扫描 APK: {sample_apk}...")
        report = APKSecurityScanner(sample_apk).scan()

        rule = "=" * 30
        print("\n" + rule + " 轻量安全扫描报告 " + rule)

        counts = report["risk_level_statistics"]
        print(f"🔴 严重问题: {counts['critical']}")
        print(f"🟠 高危问题: {counts['high']}")
        print(f"🟡 中危问题: {counts['medium']}")
        print(f"🔵 低危问题: {counts['low']}")
        print(f"📋 总问题数: {report['total_issues_found']}")

        findings = report["detailed_issues"]
        if findings:
            print("\n📝 详细问题列表:")
            for number, finding in enumerate(findings, 1):
                marker = level_markers[finding["risk_level"]]
                print(f"{number}. {marker} [{finding['issue_type']}] {finding['description']}")
    except Exception as error:
        # Top-level boundary of the demo script: report the failure and exit.
        print(f"❌ 安全扫描失败: {error}")


if __name__ == "__main__":
    main()
Summary
Today we went through a complete introduction to APKs, from understanding the basic structure to implementing lightweight tools. If you want to go deeper into Android reverse engineering and security analysis, you can explore the following directions:
- Use `androguard` or `axmlparser` to replace the simplified keyword search and parse AndroidManifest.xml with 100% accuracy
- Use tools such as `lief` and `radare2` to analyze the underlying logic of native SO libraries
- Learn jadx plug-in development and write customized code audit rules
- In-depth study of Android’s signature mechanism (V1/V2/V3/V4) and anti-tampering technology
- Get exposed to Frida dynamic Hook technology and analyze the runtime behavior of the application
(Full text, about 2600 words)