Spanicin committed on
Commit
6764da3
1 Parent(s): 04c1e71

Upload 11 files

Files changed (11)
  1. .gitignore +168 -0
  2. LICENSE +21 -0
  3. README.md +266 -10
  4. app.py +374 -0
  5. cog.yaml +35 -0
  6. inference.py +159 -0
  7. launcher.py +197 -0
  8. predict.py +214 -0
  9. quick_demo.ipynb +208 -0
  10. requirements3d.txt +21 -0
  11. webui.bat +17 -0
.gitignore ADDED
@@ -0,0 +1,168 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ examples/results/*
163
+ gfpgan/*
164
+ checkpoints/
165
+ results/*
166
+ Dockerfile
167
+ start_docker.sh
168
+ start.sh
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Tencent AI Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,10 +1,266 @@
1
- ---
2
- title: Aiavatar
3
- emoji: 👁
4
- colorFrom: red
5
- colorTo: gray
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ <div align="center">
2
+
3
+ <img src='https://user-images.githubusercontent.com/4397546/229094115-862c747e-7397-4b54-ba4a-bd368bfe2e0f.png' width='500px'/>
4
+
5
+
6
+ <!--<h2> 😭 SadTalker: <span style="font-size:12px">Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation </span> </h2> -->
7
+
8
+ <a href='https://arxiv.org/abs/2211.12194'><img src='https://img.shields.io/badge/ArXiv-PDF-red'></a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<a href='https://sadtalker.github.io'><img src='https://img.shields.io/badge/Project-Page-Green'></a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb) &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/vinthony/SadTalker) &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb) &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; [![Replicate](https://replicate.com/cjwbw/sadtalker/badge)](https://replicate.com/cjwbw/sadtalker)
9
+
10
+ <div>
11
+ <a target='_blank'>Wenxuan Zhang <sup>*,1,2</sup> </a>&emsp;
12
+ <a href='https://vinthony.github.io/' target='_blank'>Xiaodong Cun <sup>*,2</a>&emsp;
13
+ <a href='https://xuanwangvc.github.io/' target='_blank'>Xuan Wang <sup>3</sup></a>&emsp;
14
+ <a href='https://yzhang2016.github.io/' target='_blank'>Yong Zhang <sup>2</sup></a>&emsp;
15
+ <a href='https://xishen0220.github.io/' target='_blank'>Xi Shen <sup>2</sup></a>&emsp; </br>
16
+ <a href='https://yuguo-xjtu.github.io/' target='_blank'>Yu Guo<sup>1</sup> </a>&emsp;
17
+ <a href='https://scholar.google.com/citations?hl=zh-CN&user=4oXBp9UAAAAJ' target='_blank'>Ying Shan <sup>2</sup> </a>&emsp;
18
+ <a target='_blank'>Fei Wang <sup>1</sup> </a>&emsp;
19
+ </div>
20
+ <br>
21
+ <div>
22
+ <sup>1</sup> Xi'an Jiaotong University &emsp; <sup>2</sup> Tencent AI Lab &emsp; <sup>3</sup> Ant Group &emsp;
23
+ </div>
24
+ <br>
25
+ <i><strong><a href='https://arxiv.org/abs/2211.12194' target='_blank'>CVPR 2023</a></strong></i>
26
+ <br>
27
+ <br>
28
+
29
+
30
+
31
+
32
+
33
+ ![sadtalker](https://user-images.githubusercontent.com/4397546/222490039-b1f6156b-bf00-405b-9fda-0c9a9156f991.gif)
34
+
35
+ <b>TL;DR: &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; single portrait image 🙎‍♂️ &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;+ &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; audio 🎤 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; = &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; talking head video 🎞.</b>
36
+
37
+ <br>
38
+
39
+ </div>
40
+
41
+
42
+
43
+ ## 🔥 Highlight
44
+
45
+ - 🔥 The extension for the [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) is online. Check out more details [here](docs/webui_extension.md).
46
+
47
+ https://user-images.githubusercontent.com/4397546/231495639-5d4bb925-ea64-4a36-a519-6389917dac29.mp4
48
+
49
+ - 🔥 `full image mode` is online! Check out [here](https://github.com/Winfredy/SadTalker#full-bodyimage-generation) for more details.
50
+
51
+ | still+enhancer in v0.0.1 | still + enhancer in v0.0.2 | [input image @bagbag1815](https://twitter.com/bagbag1815/status/1642754319094108161) |
52
+ |:--------------------: |:--------------------: | :----: |
53
+ | <video src="https://user-images.githubusercontent.com/48216707/229484996-5d7be64f-2553-4c9e-a452-c5cf0b8ebafe.mp4" type="video/mp4"> </video> | <video src="https://user-images.githubusercontent.com/4397546/230717873-355b7bf3-d3de-49f9-a439-9220e623fce7.mp4" type="video/mp4"> </video> | <img src='./examples/source_image/full_body_2.png' width='380'>
54
+
55
+ - 🔥 Several new modes, e.g., `still mode`, `reference mode`, and `resize mode`, are online for better and more customizable applications.
56
+
57
+ - 🔥 Happy to see more community demos at [bilibili](https://search.bilibili.com/all?keyword=sadtalker&from_source=webtop_search&spm_id_from=333.1007&search_source=3
58
+ ), [YouTube](https://www.youtube.com/results?search_query=sadtalker&sp=CAM%253D) and [Twitter #sadtalker](https://twitter.com/search?q=%23sadtalker&src=typed_query).
59
+
60
+ ## 📋 Changelog (previous changelog can be found [here](docs/changlelog.md))
61
+
62
+ - __[2023.04.15]__: Added the Automatic1111 Colab by @camenduru; thanks for this awesome Colab: [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb).
63
+
64
+ - __[2023.04.12]__: Added a more detailed sd-webui installation document and fixed the reinstallation problem.
65
+
66
+ - __[2023.04.12]__: Fixed the sd-webui safety issues caused by third-party packages and optimized the output path in `sd-webui-extension`.
67
+
68
+ - __[2023.04.08]__: ❗️❗️❗️ In v0.0.2, we add a logo watermark to the generated video to prevent abuse, since the results are very realistic.
69
+
70
+ - __[2023.04.08]__: v0.0.2: full image animation, added a Baidu drive link for downloading checkpoints, and optimized the enhancer logic.
71
+
72
+
73
+ ## 🚧 TODO
74
+
75
+ <details><summary> Previous TODOs </summary>
76
+
77
+ - [x] Generating 2D face from a single Image.
78
+ - [x] Generating 3D face from Audio.
79
+ - [x] Generating 4D free-view talking examples from audio and a single image.
80
+ - [x] Gradio/Colab Demo.
81
+ - [x] Full body/image Generation.
82
+ - [x] Integrate with stable-diffusion-webui. (stay tuned!)
83
+ </details>
84
+
85
+
86
+ - [ ] Audio-driven Anime Avatar.
87
+ - [ ] Training code for each component.
88
+
89
+
90
+ ## If you have any problems, please check our [FAQ](docs/FAQ.md) before opening an issue.
91
+
92
+ ## ⚙️ 1. Installation.
93
+
94
+ Tutorials from communities: [Windows tutorial in Chinese (中文windows教程)](https://www.bilibili.com/video/BV1Dc411W7V6/) | [Japanese tutorial (日本語コース)](https://br-d.fanbox.cc/posts/5685086?utm_campaign=manage_post_page&utm_medium=share&utm_source=twitter)
95
+
96
+ ### Linux:
97
+
98
+ 1. Install [Anaconda](https://www.anaconda.com/), Python, and Git.
99
+
100
+ 2. Create the environment and install the requirements.
101
+ ```bash
102
+ git clone https://github.com/Winfredy/SadTalker.git
103
+
104
+ cd SadTalker
105
+
106
+ conda create -n sadtalker python=3.8
107
+
108
+ conda activate sadtalker
109
+
110
+ pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113
111
+
112
+ conda install ffmpeg
113
+
114
+ pip install -r requirements.txt
115
+
116
+ ### tts is optional for gradio demo.
117
+ ### pip install TTS
118
+
119
+ ```
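After the environment is set up, a quick sanity check can save a failed first run. The following is a minimal sketch (a hypothetical helper, not part of the repository) that assumes the `sadtalker` conda environment created above is active:

```python
# env_check.py -- illustrative post-install sanity check; not shipped with SadTalker.
import shutil
import torch

print("torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())           # inference.py only falls back to CPU with --cpu
print("ffmpeg on PATH:", shutil.which("ffmpeg") is not None)  # ffmpeg is required to mux audio into the video
```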
120
+ ### Windows ([Windows tutorial in Chinese (中文windows教程)](https://www.bilibili.com/video/BV1Dc411W7V6/)):
121
+
122
+ 1. Install [Python 3.10.6](https://www.python.org/downloads/windows/) and check "Add Python to PATH".
123
+ 2. Install [git](https://git-scm.com/download/win) manually (OR `scoop install git` via [scoop](https://scoop.sh/)).
124
+ 3. Install `ffmpeg`, following [this instruction](https://www.wikihow.com/Install-FFmpeg-on-Windows) (OR using `scoop install ffmpeg` via [scoop](https://scoop.sh/)).
125
+ 4. Download our SadTalker repository, for example by running `git clone https://github.com/Winfredy/SadTalker.git`.
126
+ 5. Download the `checkpoint` and `gfpgan` [below↓](https://github.com/Winfredy/SadTalker#-2-download-trained-models).
127
+ 6. Run `start.bat` from Windows Explorer as a normal, non-administrator user; a Gradio WebUI demo will start.
128
+
129
+ ### MacBook:
130
+
131
+ More tips about installation on MacBook and the Docker file can be found [here](docs/install.md).
132
+
133
+ ## 📥 2. Download Trained Models.
134
+
135
+ You can run the following script to put all the models in the right place.
136
+
137
+ ```bash
138
+ bash scripts/download_models.sh
139
+ ```
140
+
141
+ Other alternatives:
142
+ > We also provide an offline patch (`gfpgan/`), so no model will be downloaded during generation.
143
+
144
+ **Google Drive**: download our pre-trained models from [this link (main checkpoints)](https://drive.google.com/drive/folders/1Wd88VDoLhVzYsQ30_qDVluQr_Xm46yHT?usp=sharing) and [gfpgan (offline patch)](https://drive.google.com/file/d/19AIBsmfcHW6BRJmeqSFlG5fL445Xmsyi?usp=sharing).
145
+
146
+ **GitHub Release Page**: download all the files from the [latest GitHub release page](https://github.com/Winfredy/SadTalker/releases), and then put them in `./checkpoints`.
147
+
148
+ **Baidu Netdisk (百度云盘)**: we provide the models in [checkpoints, extraction code (提取码): sadt](https://pan.baidu.com/s/1nXuVNd0exUl37ISwWqbFGA?pwd=sadt) and [gfpgan, extraction code (提取码): sadt](https://pan.baidu.com/s/1kb1BCPaLOWX1JJb9Czbn6w?pwd=sadt).
149
+
150
+
151
+
152
+ <details><summary>Model Details</summary>
153
+
154
+ The final folder will be shown as:
155
+
156
+ <img width="331" alt="image" src="https://user-images.githubusercontent.com/4397546/232511411-4ca75cbf-a434-48c5-9ae0-9009e8316484.png">
157
+
158
+
159
+ Model descriptions:
160
+
161
+ | Model | Description
162
+ | :--- | :----------
163
+ |checkpoints/auido2exp_00300-model.pth | Pre-trained ExpNet in Sadtalker.
164
+ |checkpoints/auido2pose_00140-model.pth | Pre-trained PoseVAE in Sadtalker.
165
+ |checkpoints/mapping_00229-model.pth.tar | Pre-trained MappingNet in Sadtalker.
166
+ |checkpoints/mapping_00109-model.pth.tar | Pre-trained MappingNet in Sadtalker.
167
+ |checkpoints/facevid2vid_00189-model.pth.tar | Pre-trained face-vid2vid model from [the reappearance of face-vid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis).
168
+ |checkpoints/epoch_20.pth | Pre-trained 3DMM extractor in [Deep3DFaceReconstruction](https://github.com/microsoft/Deep3DFaceReconstruction).
169
+ |checkpoints/wav2lip.pth | Highly accurate lip-sync model in [Wav2lip](https://github.com/Rudrabha/Wav2Lip).
170
+ |checkpoints/shape_predictor_68_face_landmarks.dat | Face landmark model used in [dlib](http://dlib.net/).
171
+ |checkpoints/BFM | 3DMM library file.
172
+ |checkpoints/hub | Face detection models used in [face alignment](https://github.com/1adrianb/face-alignment).
173
+ |gfpgan/weights | Face detection and enhancement models used in `facexlib` and `gfpgan`.
174
+
175
+
176
+ </details>
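Before running inference, it can help to confirm that the checkpoints listed above are actually in place. A hedged sketch (file names are copied from the table; exact folder names may differ slightly between releases):

```python
# check_models.py -- illustrative checkpoint check based on the table above; not part of the repository.
import os

expected = [
    "checkpoints/auido2exp_00300-model.pth",
    "checkpoints/auido2pose_00140-model.pth",
    "checkpoints/mapping_00229-model.pth.tar",
    "checkpoints/mapping_00109-model.pth.tar",
    "checkpoints/facevid2vid_00189-model.pth.tar",
    "checkpoints/epoch_20.pth",
    "checkpoints/wav2lip.pth",
    "checkpoints/shape_predictor_68_face_landmarks.dat",
    "checkpoints/BFM",   # 3DMM library files (named BFM_Fitting in some releases)
    "checkpoints/hub",   # face-alignment detection models
    "gfpgan/weights",    # facexlib / gfpgan weights (offline patch)
]

missing = [path for path in expected if not os.path.exists(path)]
if missing:
    print("Missing model files/folders:")
    for path in missing:
        print("  -", path)
else:
    print("All expected checkpoints are in place.")
```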
177
+
178
+ ## 🔮 3. Quick Start ([Best Practice](docs/best_practice.md)).
179
+
180
+ ### WebUI Demos:
181
+
182
+ **Online**: [Huggingface](https://huggingface.co/spaces/vinthony/SadTalker) | [SDWebUI-Colab](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb) | [Colab](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb)
183
+
184
+ **Local Automatic1111 stable-diffusion webui extension**: please refer to the [Automatic1111 stable-diffusion webui docs](docs/webui_extension.md).
185
+
186
+ **Local Gradio demo**: a demo similar to our [Hugging Face demo](https://huggingface.co/spaces/vinthony/SadTalker) can be run by:
187
+
188
+ ```bash
189
+ ## you need to manually install TTS (https://github.com/coqui-ai/TTS) via `pip install TTS` in advance.
190
+ python app.py
191
+ ```
192
+
193
+ **Local Windows Gradio demo**: just double-click `webui.bat`; the requirements will be installed automatically.
194
+
195
+
196
+ ### Manual usage:
197
+
198
+ ##### Animating a portrait image with the default config:
199
+ ```bash
200
+ python inference.py --driven_audio <audio.wav> \
201
+ --source_image <video.mp4 or picture.png> \
202
+ --enhancer gfpgan
203
+ ```
204
+ The results will be saved in `results/$SOME_TIMESTAMP/*.mp4`.
205
+
206
+ ##### Full body/image Generation:
207
+
208
+ Use `--still` to generate a natural full-body video. You can add `--enhancer` to improve the quality of the generated video.
209
+
210
+ ```bash
211
+ python inference.py --driven_audio <audio.wav> \
212
+ --source_image <video.mp4 or picture.png> \
213
+ --result_dir <a folder to store results> \
214
+ --still \
215
+ --preprocess full \
216
+ --enhancer gfpgan
217
+ ```
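If you would rather drive the same CLI from Python, a minimal wrapper might look like the sketch below (the paths are placeholders; the flags simply mirror the command above):

```python
# run_inference.py -- illustrative wrapper around inference.py; adjust the paths to your own inputs.
import subprocess

cmd = [
    "python", "inference.py",
    "--driven_audio", "examples/driven_audio/bus_chinese.wav",   # replace with your audio file
    "--source_image", "examples/source_image/full_body_2.png",   # replace with your image or video
    "--result_dir", "results",
    "--still",                      # keep the original pose for full-body inputs
    "--preprocess", "full",
    "--enhancer", "gfpgan",
]
subprocess.run(cmd, check=True)     # raises CalledProcessError if the generation fails
```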
218
+
219
+ More examples, configurations, and tips can be found in the [>>> best practice documents <<<](docs/best_practice.md).
220
+
221
+ ## 🛎 Citation
222
+
223
+ If you find our work useful in your research, please consider citing:
224
+
225
+ ```bibtex
226
+ @article{zhang2022sadtalker,
227
+ title={SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation},
228
+ author={Zhang, Wenxuan and Cun, Xiaodong and Wang, Xuan and Zhang, Yong and Shen, Xi and Guo, Yu and Shan, Ying and Wang, Fei},
229
+ journal={arXiv preprint arXiv:2211.12194},
230
+ year={2022}
231
+ }
232
+ ```
233
+
234
+
235
+
236
+ ## 💗 Acknowledgements
237
+
238
+ Facerender code borrows heavily from [zhanglonghao's reproduction of face-vid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis) and [PIRender](https://github.com/RenYurui/PIRender). We thank the authors for sharing their wonderful code. In the training process, we also use models from [Deep3DFaceReconstruction](https://github.com/microsoft/Deep3DFaceReconstruction) and [Wav2lip](https://github.com/Rudrabha/Wav2Lip). We thank them for their wonderful work.
239
+
240
+ See also these wonderful third-party libraries we use:
241
+
242
+ - **Face Utils**: https://github.com/xinntao/facexlib
243
+ - **Face Enhancement**: https://github.com/TencentARC/GFPGAN
244
+ - **Image/Video Enhancement**: https://github.com/xinntao/Real-ESRGAN
245
+
246
+ ## 🥂 Extensions:
247
+
248
+ - [SadTalker-Video-Lip-Sync](https://github.com/Zz-ww/SadTalker-Video-Lip-Sync) from [@Zz-ww](https://github.com/Zz-ww): SadTalker for Video Lip Editing
249
+
250
+ ## 🥂 Related Works
251
+ - [StyleHEAT: One-Shot High-Resolution Editable Talking Face Generation via Pre-trained StyleGAN (ECCV 2022)](https://github.com/FeiiYin/StyleHEAT)
252
+ - [CodeTalker: Speech-Driven 3D Facial Animation with Discrete Motion Prior (CVPR 2023)](https://github.com/Doubiiu/CodeTalker)
253
+ - [VideoReTalking: Audio-based Lip Synchronization for Talking Head Video Editing In the Wild (SIGGRAPH Asia 2022)](https://github.com/vinthony/video-retalking)
254
+ - [DPE: Disentanglement of Pose and Expression for General Video Portrait Editing (CVPR 2023)](https://github.com/Carlyx/DPE)
255
+ - [3D GAN Inversion with Facial Symmetry Prior (CVPR 2023)](https://github.com/FeiiYin/SPI/)
256
+ - [T2M-GPT: Generating Human Motion from Textual Descriptions with Discrete Representations (CVPR 2023)](https://github.com/Mael-zys/T2M-GPT)
257
+
258
+ ## 📢 Disclaimer
259
+
260
+ This is not an official product of Tencent. This repository can only be used for personal/research/non-commercial purposes.
261
+
262
+ LOGO: color and font suggestion: [ChatGPT](ai.com), logo font:[Montserrat Alternates
263
+ ](https://fonts.google.com/specimen/Montserrat+Alternates?preview.text=SadTalker&preview.text_type=custom&query=mont).
264
+
265
+ All copyrights of the demo images and audio belong to community users or come from Stable Diffusion generation. Feel free to contact us if you feel uncomfortable.
266
+
app.py ADDED
@@ -0,0 +1,374 @@
1
+ from flask import Flask, request, jsonify
2
+ import torch
3
+ import shutil
4
+ import os
5
+ import sys
6
+ from argparse import ArgumentParser
7
+ from time import strftime
8
+ from argparse import Namespace
9
+ from src.utils.preprocess import CropAndExtract
10
+ from src.test_audio2coeff import Audio2Coeff
11
+ from src.facerender.animate import AnimateFromCoeff
12
+ from src.generate_batch import get_data
13
+ from src.generate_facerender_batch import get_facerender_data
14
+ # from src.utils.init_path import init_path
15
+ import tempfile
16
+ from openai import OpenAI
17
+ import threading
18
+ import elevenlabs
19
+ from elevenlabs import set_api_key, generate, play, clone
20
+ # from flask_cors import CORS, cross_origin
21
+ # from flask_swagger_ui import get_swaggerui_blueprint
22
+ import uuid
23
+ import time
24
+
25
+ start_time = time.time()
26
+
27
+ class AnimationConfig:
28
+ def __init__(self, driven_audio_path, source_image_path, result_folder,pose_style,expression_scale,enhancer,still,preprocess,ref_pose_video_path):
29
+ self.driven_audio = driven_audio_path
30
+ self.source_image = source_image_path
31
+ self.ref_eyeblink = ref_pose_video_path
32
+ self.ref_pose = ref_pose_video_path
33
+ self.checkpoint_dir = './checkpoints'
34
+ self.result_dir = result_folder
35
+ self.pose_style = pose_style
36
+ self.batch_size = 2
37
+ self.expression_scale = expression_scale
38
+ self.input_yaw = None
39
+ self.input_pitch = None
40
+ self.input_roll = None
41
+ self.enhancer = enhancer
42
+ self.background_enhancer = None
43
+ self.cpu = False
44
+ self.face3dvis = False
45
+ self.still = still
46
+ self.preprocess = preprocess
47
+ self.verbose = False
48
+ self.old_version = False
49
+ self.net_recon = 'resnet50'
50
+ self.init_path = None
51
+ self.use_last_fc = False
52
+ self.bfm_folder = './checkpoints/BFM_Fitting/'
53
+ self.bfm_model = 'BFM_model_front.mat'
54
+ self.focal = 1015.
55
+ self.center = 112.
56
+ self.camera_d = 10.
57
+ self.z_near = 5.
58
+ self.z_far = 15.
59
+ self.device = 'cpu'
60
+
61
+
62
+ app = Flask(__name__)
63
+
64
+ TEMP_DIR = None
65
+
66
+ app.config['temp_response'] = None
67
+ app.config['generation_thread'] = None
68
+ app.config['text_prompt'] = None
69
+ app.config['final_video_path'] = None
70
+
71
+
72
+
73
+ def main(args):
74
+ pic_path = args.source_image
75
+ audio_path = args.driven_audio
76
+ save_dir = args.result_dir
77
+ pose_style = args.pose_style
78
+ device = args.device
79
+ batch_size = args.batch_size
80
+ input_yaw_list = args.input_yaw
81
+ input_pitch_list = args.input_pitch
82
+ input_roll_list = args.input_roll
83
+ ref_eyeblink = args.ref_eyeblink
84
+ ref_pose = args.ref_pose
85
+ preprocess = args.preprocess
86
+
87
+ dir_path = os.path.dirname(os.path.realpath(__file__))
88
+ current_root_path = dir_path
89
+ print('current_root_path ',current_root_path)
90
+
91
+ # sadtalker_paths = init_path(args.checkpoint_dir, os.path.join(current_root_path, 'src/config'), args.size, args.old_version, args.preprocess)
92
+
93
+ path_of_lm_croper = os.path.join(current_root_path, args.checkpoint_dir, 'shape_predictor_68_face_landmarks.dat')
94
+ path_of_net_recon_model = os.path.join(current_root_path, args.checkpoint_dir, 'epoch_20.pth')
95
+ dir_of_BFM_fitting = os.path.join(current_root_path, args.checkpoint_dir, 'BFM_Fitting/BFM_Fitting')
96
+ wav2lip_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'wav2lip.pth')
97
+
98
+ audio2pose_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2pose_00140-model.pth')
99
+ audio2pose_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2pose.yaml')
100
+
101
+ audio2exp_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2exp_00300-model.pth')
102
+ audio2exp_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2exp.yaml')
103
+
104
+ free_view_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'facevid2vid_00189-model.pth.tar')
105
+
106
+ if preprocess == 'full':
107
+ mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00109-model.pth.tar')
108
+ facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender_still.yaml')
109
+ else:
110
+ mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00229-model.pth.tar')
111
+ facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender.yaml')
112
+
113
+ # preprocess_model = CropAndExtract(sadtalker_paths, device)
114
+ #init model
115
+ print(path_of_net_recon_model)
116
+ preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device)
117
+
118
+ # audio_to_coeff = Audio2Coeff(sadtalker_paths, device)
119
+ audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path,
120
+ audio2exp_checkpoint, audio2exp_yaml_path,
121
+ wav2lip_checkpoint, device)
122
+ # animate_from_coeff = AnimateFromCoeff(sadtalker_paths, device)
123
+ animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint,
124
+ facerender_yaml_path, device)
125
+
126
+ first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
127
+ os.makedirs(first_frame_dir, exist_ok=True)
128
+ # first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess,\
129
+ # source_image_flag=True, pic_size=args.size)
130
+
131
+
132
+ first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess, source_image_flag=True)
133
+ print('first_coeff_path ',first_coeff_path)
134
+ print('crop_pic_path ',crop_pic_path)
135
+
136
+ if first_coeff_path is None:
137
+ print("Can't get the coeffs of the input")
138
+ return
139
+
140
+ if ref_eyeblink is not None:
141
+ ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[0]
142
+ ref_eyeblink_frame_dir = os.path.join(save_dir, ref_eyeblink_videoname)
143
+ os.makedirs(ref_eyeblink_frame_dir, exist_ok=True)
144
+ # ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir, args.preprocess, source_image_flag=False)
145
+ ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir)
146
+ else:
147
+ ref_eyeblink_coeff_path=None
148
+ print('ref_eyeblink_coeff_path',ref_eyeblink_coeff_path)
149
+
150
+ if ref_pose is not None:
151
+ if ref_pose == ref_eyeblink:
152
+ ref_pose_coeff_path = ref_eyeblink_coeff_path
153
+ else:
154
+ ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0]
155
+ ref_pose_frame_dir = os.path.join(save_dir, ref_pose_videoname)
156
+ os.makedirs(ref_pose_frame_dir, exist_ok=True)
157
+ # ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir, args.preprocess, source_image_flag=False)
158
+ ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir)
159
+ else:
160
+ ref_pose_coeff_path=None
161
+ print('ref_eyeblink_coeff_path',ref_pose_coeff_path)
162
+
163
+ batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path, still=args.still)
164
+ coeff_path = audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path)
165
+
166
+
167
+ if args.face3dvis:
168
+ from src.face3d.visualize import gen_composed_video
169
+ gen_composed_video(args, device, first_coeff_path, coeff_path, audio_path, os.path.join(save_dir, '3dface.mp4'))
170
+
171
+ # data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
172
+ # batch_size, input_yaw_list, input_pitch_list, input_roll_list,
173
+ # expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess, size=args.size)
174
+
175
+
176
+ data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
177
+ batch_size, input_yaw_list, input_pitch_list, input_roll_list,
178
+ expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess)
179
+
180
+ # result, base64_video,temp_file_path= animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \
181
+ # enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess, img_size=args.size)
182
+
183
+
184
+ result, base64_video,temp_file_path = animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \
185
+ enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess)
186
+
187
+ print('The generated video is named:')
188
+ app.config['temp_response'] = base64_video
189
+ app.config['final_video_path'] = temp_file_path
190
+ return base64_video, temp_file_path
191
+
192
+ # shutil.move(result, save_dir+'.mp4')
193
+
194
+
195
+ if not args.verbose:
196
+ shutil.rmtree(save_dir)
197
+
198
+ def create_temp_dir():
199
+ return tempfile.TemporaryDirectory()
200
+
201
+ def save_uploaded_file(file, filename,TEMP_DIR):
202
+ unique_filename = str(uuid.uuid4()) + "_" + filename
203
+ file_path = os.path.join(TEMP_DIR.name, unique_filename)
204
+ file.save(file_path)
205
+ return file_path
206
+
207
+ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))  # read the API key from the environment instead of hardcoding it
208
+
209
+ def translate_text(text_prompt, target_language):
210
+ response = client.chat.completions.create(
211
+ model="gpt-4-0125-preview",
212
+ messages=[{"role": "system", "content": "You are a helpful language translator assistant."},
213
+ {"role": "user", "content": f"Translate completely without hallucination, end to end, and give the following text to {target_language} language and the text is: {text_prompt}"},
214
+ ],
215
+ max_tokens = len(text_prompt) + 200 # Use the length of the input text
216
+ # temperature=0.3,
217
+ # stop=["Translate:", "Text:"]
218
+ )
219
+ return response
220
+
221
+
222
+
223
+ @app.route("/run", methods=['POST'])
224
+ async def generate_video():
225
+ global TEMP_DIR
226
+ TEMP_DIR = create_temp_dir()
227
+ if request.method == 'POST':
228
+ source_image = request.files['source_image']
229
+ text_prompt = request.form['text_prompt']
230
+ print('Input text prompt: ',text_prompt)
231
+ voice_cloning = request.form.get('voice_cloning', 'no')
232
+ target_language = request.form.get('target_language', 'original_text')
233
+ print('target_language',target_language)
234
+ pose_style = int(request.form.get('pose_style', 1))
235
+ expression_scale = int(request.form.get('expression_scale', 1))
236
+ enhancer = request.form.get('enhancer', None)
237
+ voice_gender = request.form.get('voice_gender', 'male')
238
+ still_str = request.form.get('still', 'False')
239
+ still = still_str.lower() == 'true'
240
+ print('still', still)
241
+ preprocess = request.form.get('preprocess', 'crop')
242
+ print('preprocess selected: ',preprocess)
243
+ ref_pose_video = request.files.get('ref_pose', None)
244
+
245
+ if target_language != 'original_text':
246
+ response = translate_text(text_prompt, target_language)
247
+ # response = await translate_text_async(text_prompt, target_language)
248
+ text_prompt = response.choices[0].message.content.strip()
249
+
250
+ app.config['text_prompt'] = text_prompt
251
+ print('Final text prompt: ',text_prompt)
252
+
253
+ source_image_path = save_uploaded_file(source_image, 'source_image.png',TEMP_DIR)
254
+ print(source_image_path)
255
+
256
+ # driven_audio_path = await voice_cloning_async(voice_cloning, voice_gender, text_prompt, user_voice)
257
+
258
+ if voice_cloning == 'no':
259
+ if voice_gender == 'male':
260
+ voice = 'onyx'
261
+ else:
262
+ voice = 'nova'
263
+
264
+ print('Entering audio creation using OpenAI TTS')
265
+ response = client.audio.speech.create(model="tts-1-hd",
266
+ voice=voice,
267
+ input = text_prompt)
268
+
269
+ print('Audio created using OpenAI TTS')
270
+ with tempfile.NamedTemporaryFile(suffix=".wav", prefix="text_to_speech_",dir=TEMP_DIR.name, delete=False) as temp_file:
271
+ driven_audio_path = temp_file.name
272
+
273
+ response.write_to_file(driven_audio_path)
274
+ print('Audio file saved')
275
+
276
+ elif voice_cloning == 'yes':
277
+ user_voice = request.files['user_voice']
278
+
279
+ with tempfile.NamedTemporaryFile(suffix=".wav", prefix="user_voice_",dir=TEMP_DIR.name, delete=False) as temp_file:
280
+ user_voice_path = temp_file.name
281
+ user_voice.save(user_voice_path)
282
+ print('user_voice_path',user_voice_path)
283
+
284
+ set_api_key(os.getenv("ELEVENLABS_API_KEY"))  # read the ElevenLabs key from the environment instead of hardcoding it
285
+ voice = clone(name = "User Cloned Voice",
286
+ files = [user_voice_path] )
287
+
288
+ audio = generate(text = text_prompt, voice = voice, model = "eleven_multilingual_v2",stream=True, latency=4)
289
+ with tempfile.NamedTemporaryFile(suffix=".mp3", prefix="cloned_audio_",dir=TEMP_DIR.name, delete=False) as temp_file:
290
+ for chunk in audio:
291
+ temp_file.write(chunk)
292
+ driven_audio_path = temp_file.name
293
+ print('driven_audio_path',driven_audio_path)
294
+
295
+ # elevenlabs.save(audio, driven_audio_path)
296
+
297
+ save_dir = tempfile.mkdtemp(dir=TEMP_DIR.name)
298
+ result_folder = os.path.join(save_dir, "results")
299
+ os.makedirs(result_folder, exist_ok=True)
300
+
301
+ ref_pose_video_path = None
302
+ if ref_pose_video:
303
+ with tempfile.NamedTemporaryFile(suffix=".mp4", prefix="ref_pose_",dir=TEMP_DIR.name, delete=False) as temp_file:
304
+ ref_pose_video_path = temp_file.name
305
+ ref_pose_video.save(ref_pose_video_path)
306
+ print('ref_pose_video_path',ref_pose_video_path)
307
+
308
+ # Example of using the class with some hypothetical paths
309
+ args = AnimationConfig(driven_audio_path=driven_audio_path, source_image_path=source_image_path, result_folder=result_folder, pose_style=pose_style, expression_scale=expression_scale, enhancer=enhancer,still=still,preprocess=preprocess,ref_pose_video_path=ref_pose_video_path)
310
+
311
+ if torch.cuda.is_available() and not args.cpu:
312
+ args.device = "cuda"
313
+ else:
314
+ args.device = "cpu"
315
+
316
+ generation_thread = threading.Thread(target=main, args=(args,))
317
+ app.config['generation_thread'] = generation_thread
318
+ generation_thread.start()
319
+ response_data = {"message": "Video generation started",
320
+ "process_id": generation_thread.ident}
321
+
322
+ return jsonify(response_data)
323
+ # base64_video = main(args)
324
+ # return jsonify({"base64_video": base64_video})
325
+
326
+ #else:
327
+ # return 'Unsupported HTTP method', 405
328
+
329
+ @app.route("/status", methods=["GET"])
330
+ def check_generation_status():
331
+ global TEMP_DIR
332
+ response = {"base64_video": "","text_prompt":"", "status": ""}
333
+ process_id = request.args.get('process_id', None)
334
+
335
+ # process_id is required to check the status for that specific process
336
+ if process_id:
337
+ generation_thread = app.config.get('generation_thread')
338
+ if generation_thread and generation_thread.ident == int(process_id) and generation_thread.is_alive():
339
+ return jsonify({"status": "in_progress"}), 200
340
+ elif app.config.get('temp_response'):
341
+ # app.config['temp_response']['status'] = 'completed'
342
+ final_response = app.config['temp_response']
343
+ response["base64_video"] = final_response
344
+ response["text_prompt"] = app.config.get('text_prompt')
345
+ response["status"] = "completed"
346
+
347
+ final_video_path = app.config['final_video_path']
348
+ print('final_video_path',final_video_path)
349
+
350
+
351
+ if final_video_path and os.path.exists(final_video_path):
352
+ os.remove(final_video_path)
353
+ print("Deleted video file:", final_video_path)
354
+
355
+ TEMP_DIR.cleanup()
356
+ # print("Temporary Directory:", TEMP_DIR.name)
357
+ # if TEMP_DIR:
358
+ # print("Contents of Temporary Directory:")
359
+ # for filename in os.listdir(TEMP_DIR.name):
360
+ # print(filename)
361
+ # else:
362
+ # print("Temporary Directory is None or already cleaned up.")
363
+ end_time = time.time()
364
+ total_time = round(end_time - start_time, 2)
365
+ print("Total time taken for execution:", total_time, " seconds")
366
+ return jsonify(response)
367
+ return jsonify({"error":"No process id provided"})
368
+
369
+ @app.route("/health", methods=["GET"])
370
+ def health_status():
371
+ response = {"online": "true"}
372
+ return jsonify(response)
373
+ if __name__ == '__main__':
374
+ app.run(debug=True)
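For reference, the Flask service above exposes three routes: `POST /run` starts a generation thread and returns a `process_id`, `GET /status?process_id=...` reports progress and eventually returns the base64-encoded video, and `GET /health` is a liveness probe. A hedged client sketch (it assumes the server is running locally via `python app.py` on Flask's default port, and that `base64_video` is the base64-encoded MP4, as the variable name suggests):

```python
# client_example.py -- illustrative client for the Flask API above; not part of the repository.
import base64
import time

import requests

BASE = "http://127.0.0.1:5000"

# Start a generation job with a source portrait and a text prompt (OpenAI TTS voice, no cloning).
with open("examples/source_image/full_body_2.png", "rb") as img:
    resp = requests.post(
        f"{BASE}/run",
        files={"source_image": img},
        data={
            "text_prompt": "Hello, this is a SadTalker demo.",
            "voice_cloning": "no",
            "voice_gender": "male",
            "target_language": "original_text",
            "pose_style": "1",
            "expression_scale": "1",
            "still": "True",
            "preprocess": "full",
        },
        timeout=120,
    )
process_id = resp.json()["process_id"]

# Poll /status until the background thread reports completion, then decode the video.
while True:
    status = requests.get(f"{BASE}/status", params={"process_id": process_id}, timeout=60).json()
    if status.get("status") == "completed":
        with open("result.mp4", "wb") as f:
            f.write(base64.b64decode(status["base64_video"]))
        break
    time.sleep(5)
```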
cog.yaml ADDED
@@ -0,0 +1,35 @@
1
+ build:
2
+ gpu: true
3
+ cuda: "11.3"
4
+ python_version: "3.8"
5
+ system_packages:
6
+ - "ffmpeg"
7
+ - "libgl1-mesa-glx"
8
+ - "libglib2.0-0"
9
+ python_packages:
10
+ - "torch==1.12.1"
11
+ - "torchvision==0.13.1"
12
+ - "torchaudio==0.12.1"
13
+ - "joblib==1.1.0"
14
+ - "scikit-image==0.19.3"
15
+ - "basicsr==1.4.2"
16
+ - "facexlib==0.3.0"
17
+ - "resampy==0.3.1"
18
+ - "pydub==0.25.1"
19
+ - "scipy==1.10.1"
20
+ - "kornia==0.6.8"
21
+ - "face_alignment==1.3.5"
22
+ - "imageio==2.19.3"
23
+ - "imageio-ffmpeg==0.4.7"
24
+ - "librosa==0.9.2" #
25
+ - "tqdm==4.65.0"
26
+ - "yacs==0.1.8"
27
+ - "gfpgan==1.3.8"
28
+ - "dlib-bin==19.24.1"
29
+ - "av==10.0.0"
30
+ - "trimesh==3.9.20"
31
+ run:
32
+ - mkdir -p /root/.cache/torch/hub/checkpoints/ && wget --output-document "/root/.cache/torch/hub/checkpoints/s3fd-619a316812.pth" "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"
33
+ - mkdir -p /root/.cache/torch/hub/checkpoints/ && wget --output-document "/root/.cache/torch/hub/checkpoints/2DFAN4-cd938726ad.zip" "https://www.adrianbulat.com/downloads/python-fan/2DFAN4-cd938726ad.zip"
34
+
35
+ predict: "predict.py:Predictor"
inference.py ADDED
@@ -0,0 +1,159 @@
1
+ import torch
2
+ from time import strftime
3
+ import os, sys, time
4
+ from argparse import ArgumentParser
5
+
6
+ from src.utils.preprocess import CropAndExtract
7
+ from src.test_audio2coeff import Audio2Coeff
8
+ from src.facerender.animate import AnimateFromCoeff
9
+ from src.generate_batch import get_data
10
+ from src.generate_facerender_batch import get_facerender_data
11
+
12
+ def main(args):
13
+ #torch.backends.cudnn.enabled = False
14
+
15
+ pic_path = args.source_image
16
+ audio_path = args.driven_audio
17
+ save_dir = os.path.join(args.result_dir, strftime("%Y_%m_%d_%H.%M.%S"))
18
+ os.makedirs(save_dir, exist_ok=True)
19
+ pose_style = args.pose_style
20
+ device = args.device
21
+ batch_size = args.batch_size
22
+ input_yaw_list = args.input_yaw
23
+ input_pitch_list = args.input_pitch
24
+ input_roll_list = args.input_roll
25
+ ref_eyeblink = args.ref_eyeblink
26
+ ref_pose = args.ref_pose
27
+
28
+ current_code_path = sys.argv[0]
29
+ current_root_path = os.path.split(current_code_path)[0]
30
+
31
+ os.environ['TORCH_HOME']=os.path.join(current_root_path, args.checkpoint_dir)
32
+
33
+ path_of_lm_croper = os.path.join(current_root_path, args.checkpoint_dir, 'shape_predictor_68_face_landmarks.dat')
34
+ path_of_net_recon_model = os.path.join(current_root_path, args.checkpoint_dir, 'epoch_20.pth')
35
+ dir_of_BFM_fitting = os.path.join(current_root_path, args.checkpoint_dir, 'BFM_Fitting')
36
+ wav2lip_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'wav2lip.pth')
37
+
38
+ audio2pose_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2pose_00140-model.pth')
39
+ audio2pose_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2pose.yaml')
40
+
41
+ audio2exp_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2exp_00300-model.pth')
42
+ audio2exp_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2exp.yaml')
43
+
44
+ free_view_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'facevid2vid_00189-model.pth.tar')
45
+
46
+ if args.preprocess == 'full':
47
+ mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00109-model.pth.tar')
48
+ facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender_still.yaml')
49
+ else:
50
+ mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00229-model.pth.tar')
51
+ facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender.yaml')
52
+
53
+ #init model
54
+ print(path_of_net_recon_model)
55
+ preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device)
56
+
57
+ print(audio2pose_checkpoint)
58
+ print(audio2exp_checkpoint)
59
+ audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path,
60
+ audio2exp_checkpoint, audio2exp_yaml_path,
61
+ wav2lip_checkpoint, device)
62
+
63
+ print(free_view_checkpoint)
64
+ print(mapping_checkpoint)
65
+ animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint,
66
+ facerender_yaml_path, device)
67
+
68
+ #crop image and extract 3dmm from image
69
+ first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
70
+ os.makedirs(first_frame_dir, exist_ok=True)
71
+ print('3DMM Extraction for source image')
72
+ first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess, source_image_flag=True)
73
+ if first_coeff_path is None:
74
+ print("Can't get the coeffs of the input")
75
+ return
76
+
77
+ if ref_eyeblink is not None:
78
+ ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[0]
79
+ ref_eyeblink_frame_dir = os.path.join(save_dir, ref_eyeblink_videoname)
80
+ os.makedirs(ref_eyeblink_frame_dir, exist_ok=True)
81
+ print('3DMM Extraction for the reference video providing eye blinking')
82
+ ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir)
83
+ else:
84
+ ref_eyeblink_coeff_path=None
85
+
86
+ if ref_pose is not None:
87
+ if ref_pose == ref_eyeblink:
88
+ ref_pose_coeff_path = ref_eyeblink_coeff_path
89
+ else:
90
+ ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0]
91
+ ref_pose_frame_dir = os.path.join(save_dir, ref_pose_videoname)
92
+ os.makedirs(ref_pose_frame_dir, exist_ok=True)
93
+ print('3DMM Extraction for the reference video providing pose')
94
+ ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir)
95
+ else:
96
+ ref_pose_coeff_path=None
97
+
98
+ #audio2ceoff
99
+ batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path, still=args.still)
100
+ coeff_path = audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path)
101
+
102
+ # 3dface render
103
+ if args.face3dvis:
104
+ from src.face3d.visualize import gen_composed_video
105
+ gen_composed_video(args, device, first_coeff_path, coeff_path, audio_path, os.path.join(save_dir, '3dface.mp4'))
106
+
107
+ #coeff2video
108
+ data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
109
+ batch_size, input_yaw_list, input_pitch_list, input_roll_list,
110
+ expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess)
111
+
112
+ animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \
113
+ enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess)
114
+
115
+ if __name__ == '__main__':
116
+
117
+ parser = ArgumentParser()
118
+ parser.add_argument("--driven_audio", default='./examples/driven_audio/bus_chinese.wav', help="path to driven audio")
119
+ parser.add_argument("--source_image", default='./examples/source_image/full_body_2.png', help="path to source image")
120
+ parser.add_argument("--ref_eyeblink", default=None, help="path to reference video providing eye blinking")
121
+ parser.add_argument("--ref_pose", default=None, help="path to reference video providing pose")
122
+ parser.add_argument("--checkpoint_dir", default='./checkpoints', help="path to output")
123
+ parser.add_argument("--result_dir", default='./results', help="path to output")
124
+ parser.add_argument("--pose_style", type=int, default=0, help="input pose style from [0, 46)")
125
+ parser.add_argument("--batch_size", type=int, default=2, help="the batch size of facerender")
126
+ parser.add_argument("--expression_scale", type=float, default=1., help="the batch size of facerender")
127
+ parser.add_argument('--input_yaw', nargs='+', type=int, default=None, help="the input yaw degree of the user ")
128
+ parser.add_argument('--input_pitch', nargs='+', type=int, default=None, help="the input pitch degree of the user")
129
+ parser.add_argument('--input_roll', nargs='+', type=int, default=None, help="the input roll degree of the user")
130
+ parser.add_argument('--enhancer', type=str, default=None, help="Face enhancer, [gfpgan, RestoreFormer]")
131
+ parser.add_argument('--background_enhancer', type=str, default=None, help="background enhancer, [realesrgan]")
132
+ parser.add_argument("--cpu", dest="cpu", action="store_true")
133
+ parser.add_argument("--face3dvis", action="store_true", help="generate 3d face and 3d landmarks")
134
+ parser.add_argument("--still", action="store_true", help="can crop back to the original videos for the full body aniamtion")
135
+ parser.add_argument("--preprocess", default='crop', choices=['crop', 'resize', 'full'], help="how to preprocess the images" )
136
+
137
+ # net structure and parameters
138
+ parser.add_argument('--net_recon', type=str, default='resnet50', choices=['resnet18', 'resnet34', 'resnet50'], help='useless')
139
+ parser.add_argument('--init_path', type=str, default=None, help='Useless')
140
+ parser.add_argument('--use_last_fc',default=False, help='zero initialize the last fc')
141
+ parser.add_argument('--bfm_folder', type=str, default='./checkpoints/BFM_Fitting/')
142
+ parser.add_argument('--bfm_model', type=str, default='BFM_model_front.mat', help='bfm model')
143
+
144
+ # default renderer parameters
145
+ parser.add_argument('--focal', type=float, default=1015.)
146
+ parser.add_argument('--center', type=float, default=112.)
147
+ parser.add_argument('--camera_d', type=float, default=10.)
148
+ parser.add_argument('--z_near', type=float, default=5.)
149
+ parser.add_argument('--z_far', type=float, default=15.)
150
+
151
+ args = parser.parse_args()
152
+
153
+ if torch.cuda.is_available() and not args.cpu:
154
+ args.device = "cuda"
155
+ else:
156
+ args.device = "cpu"
157
+
158
+ main(args)
159
+
launcher.py ADDED
@@ -0,0 +1,197 @@
1
+ # this script installs the necessary requirements and launches the main program from app.py
2
+ # borrowed from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/master/launch.py
3
+ import subprocess
4
+ import os
5
+ import sys
6
+ import importlib.util
7
+ import shlex
8
+ import platform
9
+ import json
10
+
11
+ python = sys.executable
12
+ git = os.environ.get('GIT', "git")
13
+ index_url = os.environ.get('INDEX_URL', "")
14
+ stored_commit_hash = None
15
+ skip_install = False
16
+ dir_repos = "repositories"
17
+ script_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
18
+
19
+ if 'GRADIO_ANALYTICS_ENABLED' not in os.environ:
20
+ os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
21
+
22
+
23
+ def check_python_version():
24
+ is_windows = platform.system() == "Windows"
25
+ major = sys.version_info.major
26
+ minor = sys.version_info.minor
27
+ micro = sys.version_info.micro
28
+
29
+ if is_windows:
30
+ supported_minors = [10]
31
+ else:
32
+ supported_minors = [7, 8, 9, 10, 11]
33
+
34
+ if not (major == 3 and minor in supported_minors):
35
+
36
+ raise RuntimeError(f"""
37
+ INCOMPATIBLE PYTHON VERSION
38
+ This program is tested with 3.10.6 Python, but you have {major}.{minor}.{micro}.
39
+ If you encounter an error with "RuntimeError: Couldn't install torch." message,
40
+ or any other error regarding unsuccessful package (library) installation,
41
+ please downgrade (or upgrade) to the latest version of 3.10 Python
42
+ and delete current Python and "venv" folder in WebUI's directory.
43
+ You can download 3.10 Python from here: https://www.python.org/downloads/release/python-3109/
44
+ {"Alternatively, use a binary release of WebUI: https://github.com/AUTOMATIC1111/stable-diffusion-webui/releases" if is_windows else ""}
45
+ Use --skip-python-version-check to suppress this warning.
46
+ """)
47
+
48
+
49
+ def commit_hash():
50
+ global stored_commit_hash
51
+
52
+ if stored_commit_hash is not None:
53
+ return stored_commit_hash
54
+
55
+ try:
56
+ stored_commit_hash = run(f"{git} rev-parse HEAD").strip()
57
+ except Exception:
58
+ stored_commit_hash = "<none>"
59
+
60
+ return stored_commit_hash
61
+
62
+
63
+ def run(command, desc=None, errdesc=None, custom_env=None, live=False):
64
+ if desc is not None:
65
+ print(desc)
66
+
67
+ if live:
68
+ result = subprocess.run(command, shell=True, env=os.environ if custom_env is None else custom_env)
69
+ if result.returncode != 0:
70
+ raise RuntimeError(f"""{errdesc or 'Error running command'}.
71
+ Command: {command}
72
+ Error code: {result.returncode}""")
73
+
74
+ return ""
75
+
76
+ result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, env=os.environ if custom_env is None else custom_env)
77
+
78
+ if result.returncode != 0:
79
+
80
+ message = f"""{errdesc or 'Error running command'}.
81
+ Command: {command}
82
+ Error code: {result.returncode}
83
+ stdout: {result.stdout.decode(encoding="utf8", errors="ignore") if len(result.stdout)>0 else '<empty>'}
84
+ stderr: {result.stderr.decode(encoding="utf8", errors="ignore") if len(result.stderr)>0 else '<empty>'}
85
+ """
86
+ raise RuntimeError(message)
87
+
88
+ return result.stdout.decode(encoding="utf8", errors="ignore")
89
+
90
+
91
+ def check_run(command):
92
+ result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
93
+ return result.returncode == 0
94
+
95
+
96
+ def is_installed(package):
97
+ try:
98
+ spec = importlib.util.find_spec(package)
99
+ except ModuleNotFoundError:
100
+ return False
101
+
102
+ return spec is not None
103
+
104
+
105
+ def repo_dir(name):
106
+ return os.path.join(script_path, dir_repos, name)
107
+
108
+
109
+ def run_python(code, desc=None, errdesc=None):
110
+ return run(f'"{python}" -c "{code}"', desc, errdesc)
111
+
112
+
113
+ def run_pip(args, desc=None):
114
+ if skip_install:
115
+ return
116
+
117
+ index_url_line = f' --index-url {index_url}' if index_url != '' else ''
118
+ return run(f'"{python}" -m pip {args} --prefer-binary{index_url_line}', desc=f"Installing {desc}", errdesc=f"Couldn't install {desc}")
119
+
120
+
121
+ def check_run_python(code):
122
+ return check_run(f'"{python}" -c "{code}"')
123
+
124
+
125
+ def git_clone(url, dir, name, commithash=None):
126
+ # TODO clone into temporary dir and move if successful
127
+
128
+ if os.path.exists(dir):
129
+ if commithash is None:
130
+ return
131
+
132
+ current_hash = run(f'"{git}" -C "{dir}" rev-parse HEAD', None, f"Couldn't determine {name}'s hash: {commithash}").strip()
133
+ if current_hash == commithash:
134
+ return
135
+
136
+ run(f'"{git}" -C "{dir}" fetch', f"Fetching updates for {name}...", f"Couldn't fetch {name}")
137
+ run(f'"{git}" -C "{dir}" checkout {commithash}', f"Checking out commit for {name} with hash: {commithash}...", f"Couldn't checkout commit {commithash} for {name}")
138
+ return
139
+
140
+ run(f'"{git}" clone "{url}" "{dir}"', f"Cloning {name} into {dir}...", f"Couldn't clone {name}")
141
+
142
+ if commithash is not None:
143
+ run(f'"{git}" -C "{dir}" checkout {commithash}', None, "Couldn't checkout {name}'s hash: {commithash}")
144
+
145
+
146
+ def git_pull_recursive(dir):
147
+ for subdir, _, _ in os.walk(dir):
148
+ if os.path.exists(os.path.join(subdir, '.git')):
149
+ try:
150
+ output = subprocess.check_output([git, '-C', subdir, 'pull', '--autostash'])
151
+ print(f"Pulled changes for repository in '{subdir}':\n{output.decode('utf-8').strip()}\n")
152
+ except subprocess.CalledProcessError as e:
153
+ print(f"Couldn't perform 'git pull' on repository in '{subdir}':\n{e.output.decode('utf-8').strip()}\n")
154
+
155
+
156
+ def run_extension_installer(extension_dir):
157
+ path_installer = os.path.join(extension_dir, "install.py")
158
+ if not os.path.isfile(path_installer):
159
+ return
160
+
161
+ try:
162
+ env = os.environ.copy()
163
+ env['PYTHONPATH'] = os.path.abspath(".")
164
+
165
+ print(run(f'"{python}" "{path_installer}"', errdesc=f"Error running install.py for extension {extension_dir}", custom_env=env))
166
+ except Exception as e:
167
+ print(e, file=sys.stderr)
168
+
169
+
170
+ def prepare_environment():
171
+ global skip_install
172
+
173
+ torch_command = os.environ.get('TORCH_COMMAND', "pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117")
174
+ requirements_file = os.environ.get('REQS_FILE', "requirements.txt")
175
+
176
+ commit = commit_hash()
177
+
178
+ print(f"Python {sys.version}")
179
+ print(f"Commit hash: {commit}")
180
+
181
+ if not is_installed("torch") or not is_installed("torchvision"):
182
+ run(f'"{python}" -m {torch_command}', "Installing torch and torchvision", "Couldn't install torch", live=True)
183
+
184
+ run_python("import torch; assert torch.cuda.is_available(), 'Torch is not able to use GPU; add --skip-torch-cuda-test to COMMANDLINE_ARGS variable to disable this check'")
185
+
186
+ run_pip(f"install -r \"{requirements_file}\"", "requirements for SadTalker WebUI (may take longer time in first time)")
187
+
188
+
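+ # build the Gradio demo from app.py and expose it with a public share link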
189
+ def start():
190
+ print(f"Launching SadTalker Web UI")
191
+ from app import sadtalker_demo
192
+ demo = sadtalker_demo()
193
+ demo.launch(share=True)
194
+
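+ # Usage sketch (assumed direct invocation; webui.bat runs this script from its venv):
+ #   python launcher.py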
195
+ if __name__ == "__main__":
196
+ prepare_environment()
197
+ start()
predict.py ADDED
@@ -0,0 +1,214 @@
1
+ """run bash scripts/download_models.sh first to prepare the weights file"""
2
+ import os
3
+ import shutil
4
+ from argparse import Namespace
5
+ from src.utils.preprocess import CropAndExtract
6
+ from src.test_audio2coeff import Audio2Coeff
7
+ from src.facerender.animate import AnimateFromCoeff
8
+ from src.generate_batch import get_data
9
+ from src.generate_facerender_batch import get_facerender_data
10
+ from cog import BasePredictor, Input, Path
11
+
12
+ checkpoints = "checkpoints"
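+ # all pretrained weights are expected under ./checkpoints (prepared by scripts/download_models.sh)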
13
+
14
+
15
+ class Predictor(BasePredictor):
16
+ def setup(self):
17
+ """Load the model into memory to make running multiple predictions efficient"""
18
+ device = "cuda"
19
+
20
+ path_of_lm_croper = os.path.join(
21
+ checkpoints, "shape_predictor_68_face_landmarks.dat"
22
+ )
23
+ path_of_net_recon_model = os.path.join(checkpoints, "epoch_20.pth")
24
+ dir_of_BFM_fitting = os.path.join(checkpoints, "BFM_Fitting")
25
+ wav2lip_checkpoint = os.path.join(checkpoints, "wav2lip.pth")
26
+
27
+ audio2pose_checkpoint = os.path.join(checkpoints, "auido2pose_00140-model.pth")
28
+ audio2pose_yaml_path = os.path.join("src", "config", "auido2pose.yaml")
29
+
30
+ audio2exp_checkpoint = os.path.join(checkpoints, "auido2exp_00300-model.pth")
31
+ audio2exp_yaml_path = os.path.join("src", "config", "auido2exp.yaml")
32
+
33
+ free_view_checkpoint = os.path.join(
34
+ checkpoints, "facevid2vid_00189-model.pth.tar"
35
+ )
36
+
37
+ # init model
38
+ self.preprocess_model = CropAndExtract(
39
+ path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device
40
+ )
41
+
42
+ self.audio_to_coeff = Audio2Coeff(
43
+ audio2pose_checkpoint,
44
+ audio2pose_yaml_path,
45
+ audio2exp_checkpoint,
46
+ audio2exp_yaml_path,
47
+ wav2lip_checkpoint,
48
+ device,
49
+ )
50
+
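+ # two renderer configurations: "full" uses the still-mode facerender config, "others" the standard cropped-face config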
51
+ self.animate_from_coeff = {
52
+ "full": AnimateFromCoeff(
53
+ free_view_checkpoint,
54
+ os.path.join(checkpoints, "mapping_00109-model.pth.tar"),
55
+ os.path.join("src", "config", "facerender_still.yaml"),
56
+ device,
57
+ ),
58
+ "others": AnimateFromCoeff(
59
+ free_view_checkpoint,
60
+ os.path.join(checkpoints, "mapping_00229-model.pth.tar"),
61
+ os.path.join("src", "config", "facerender.yaml"),
62
+ device,
63
+ ),
64
+ }
65
+
66
+ def predict(
67
+ self,
68
+ source_image: Path = Input(
69
+ description="Upload the source image, it can be video.mp4 or picture.png",
70
+ ),
71
+ driven_audio: Path = Input(
72
+ description="Upload the driven audio, accepts .wav and .mp4 file",
73
+ ),
74
+ enhancer: str = Input(
75
+ description="Choose a face enhancer",
76
+ choices=["gfpgan", "RestoreFormer"],
77
+ default="gfpgan",
78
+ ),
79
+ preprocess: str = Input(
80
+ description="how to preprocess the images",
81
+ choices=["crop", "resize", "full"],
82
+ default="full",
83
+ ),
84
+ ref_eyeblink: Path = Input(
85
+ description="path to reference video providing eye blinking",
86
+ default=None,
87
+ ),
88
+ ref_pose: Path = Input(
89
+ description="path to reference video providing pose",
90
+ default=None,
91
+ ),
92
+ still: bool = Input(
93
+ description="can crop back to the original videos for the full body aniamtion when preprocess is full",
94
+ default=True,
95
+ ),
96
+ ) -> Path:
97
+ """Run a single prediction on the model"""
98
+
99
+ animate_from_coeff = (
100
+ self.animate_from_coeff["full"]
101
+ if preprocess == "full"
102
+ else self.animate_from_coeff["others"]
103
+ )
104
+
105
+ args = load_default()
106
+ args.pic_path = str(source_image)
107
+ args.audio_path = str(driven_audio)
108
+ device = "cuda"
109
+ args.still = still
110
+ args.ref_eyeblink = None if ref_eyeblink is None else str(ref_eyeblink)
111
+ args.ref_pose = None if ref_pose is None else str(ref_pose)
112
+
113
+ # crop image and extract 3dmm from image
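+ # start every prediction from a clean results directory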
114
+ results_dir = "results"
115
+ if os.path.exists(results_dir):
116
+ shutil.rmtree(results_dir)
117
+ os.makedirs(results_dir)
118
+ first_frame_dir = os.path.join(results_dir, "first_frame_dir")
119
+ os.makedirs(first_frame_dir)
120
+
121
+ print("3DMM Extraction for source image")
122
+ first_coeff_path, crop_pic_path, crop_info = self.preprocess_model.generate(
123
+ args.pic_path, first_frame_dir, preprocess, source_image_flag=True
124
+ )
125
+ if first_coeff_path is None:
126
+ print("Can't get the coeffs of the input")
127
+ return
128
+
129
+ if ref_eyeblink is not None:
130
+ ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[
131
+ 0
132
+ ]
133
+ ref_eyeblink_frame_dir = os.path.join(results_dir, ref_eyeblink_videoname)
134
+ os.makedirs(ref_eyeblink_frame_dir, exist_ok=True)
135
+ print("3DMM Extraction for the reference video providing eye blinking")
136
+ ref_eyeblink_coeff_path, _, _ = self.preprocess_model.generate(
137
+ ref_eyeblink, ref_eyeblink_frame_dir
138
+ )
139
+ else:
140
+ ref_eyeblink_coeff_path = None
141
+
142
+ if ref_pose is not None:
143
+ if ref_pose == ref_eyeblink:
144
+ ref_pose_coeff_path = ref_eyeblink_coeff_path
145
+ else:
146
+ ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0]
147
+ ref_pose_frame_dir = os.path.join(results_dir, ref_pose_videoname)
148
+ os.makedirs(ref_pose_frame_dir, exist_ok=True)
149
+ print("3DMM Extraction for the reference video providing pose")
150
+ ref_pose_coeff_path, _, _ = self.preprocess_model.generate(
151
+ ref_pose, ref_pose_frame_dir
152
+ )
153
+ else:
154
+ ref_pose_coeff_path = None
155
+
156
+ # audio2coeff: predict motion coefficients from the driving audio
157
+ batch = get_data(
158
+ first_coeff_path,
159
+ args.audio_path,
160
+ device,
161
+ ref_eyeblink_coeff_path,
162
+ still=still,
163
+ )
164
+ coeff_path = self.audio_to_coeff.generate(
165
+ batch, results_dir, args.pose_style, ref_pose_coeff_path
166
+ )
167
+ # coeff2video
168
+ print("coeff2video")
169
+ data = get_facerender_data(
170
+ coeff_path,
171
+ crop_pic_path,
172
+ first_coeff_path,
173
+ args.audio_path,
174
+ args.batch_size,
175
+ args.input_yaw,
176
+ args.input_pitch,
177
+ args.input_roll,
178
+ expression_scale=args.expression_scale,
179
+ still_mode=still,
180
+ preprocess=preprocess,
181
+ )
182
+ animate_from_coeff.generate(
183
+ data, results_dir, args.pic_path, crop_info,
184
+ enhancer=enhancer, background_enhancer=args.background_enhancer,
185
+ preprocess=preprocess)
186
+
187
+ output = "/tmp/out.mp4"
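+ # the generate() call above writes an "*enhanced.mp4" into results/; copy it to a fixed output path for Cog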
188
+ mp4_path = os.path.join(results_dir, [f for f in os.listdir(results_dir) if "enhanced.mp4" in f][0])
189
+ shutil.copy(mp4_path, output)
190
+
191
+ return Path(output)
192
+
193
+
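+ # defaults for the options that are not exposed as prediction inputs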
194
+ def load_default():
195
+ return Namespace(
196
+ pose_style=0,
197
+ batch_size=2,
198
+ expression_scale=1.0,
199
+ input_yaw=None,
200
+ input_pitch=None,
201
+ input_roll=None,
202
+ background_enhancer=None,
203
+ face3dvis=False,
204
+ net_recon="resnet50",
205
+ init_path=None,
206
+ use_last_fc=False,
207
+ bfm_folder="./checkpoints/BFM_Fitting/",
208
+ bfm_model="BFM_model_front.mat",
209
+ focal=1015.0,
210
+ center=112.0,
211
+ camera_d=10.0,
212
+ z_near=5.0,
213
+ z_far=15.0,
214
+ )
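+ # Usage sketch (assumed invocation; Cog normally calls setup()/predict() itself):
+ #   cog predict -i source_image=@examples/source_image/full3.png \
+ #               -i driven_audio=@examples/driven_audio/RD_Radio31_000.wav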
quick_demo.ipynb ADDED
@@ -0,0 +1,208 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "M74Gs_TjYl_B"
7
+ },
8
+ "source": [
9
+ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb)"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {
15
+ "id": "view-in-github"
16
+ },
17
+ "source": [
18
+ "### SadTalker:Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation \n",
19
+ "\n",
20
+ "[arxiv](https://arxiv.org/abs/2211.12194) | [project](https://sadtalker.github.io) | [Github](https://github.com/Winfredy/SadTalker)\n",
21
+ "\n",
22
+ "Wenxuan Zhang, Xiaodong Cun, Xuan Wang, Yong Zhang, Xi Shen, Yu Guo, Ying Shan, Fei Wang.\n",
23
+ "\n",
24
+ "Xi'an Jiaotong University, Tencent AI Lab, Ant Group\n",
25
+ "\n",
26
+ "CVPR 2023\n",
27
+ "\n",
28
+ "TL;DR: A realistic and stylized talking head video generation method from a single image and audio\n"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "metadata": {
34
+ "id": "kA89DV-sKS4i"
35
+ },
36
+ "source": [
37
+ "Installation (around 5 mins)"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "metadata": {
44
+ "id": "qJ4CplXsYl_E"
45
+ },
46
+ "outputs": [],
47
+ "source": [
48
+ "### make sure that CUDA is available in Edit -> Nootbook settings -> GPU\n",
49
+ "!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "metadata": {
56
+ "id": "Mdq6j4E5KQAR"
57
+ },
58
+ "outputs": [],
59
+ "source": [
60
+ "!update-alternatives --install /usr/local/bin/python3 python3 /usr/bin/python3.8 2 \n",
61
+ "!update-alternatives --install /usr/local/bin/python3 python3 /usr/bin/python3.9 1 \n",
62
+ "!python --version \n",
63
+ "!apt-get update\n",
64
+ "!apt install software-properties-common\n",
65
+ "!sudo dpkg --remove --force-remove-reinstreq python3-pip python3-setuptools python3-wheel\n",
66
+ "!apt-get install python3-pip\n",
67
+ "\n",
68
+ "print('Git clone project and install requirements...')\n",
69
+ "!git clone https://github.com/Winfredy/SadTalker &> /dev/null\n",
70
+ "%cd SadTalker \n",
71
+ "!export PYTHONPATH=/content/SadTalker:$PYTHONPATH \n",
72
+ "!python3.8 -m pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113\n",
73
+ "!apt update\n",
74
+ "!apt install ffmpeg &> /dev/null \n",
75
+ "!python3.8 -m pip install -r requirements.txt"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "markdown",
80
+ "metadata": {
81
+ "id": "DddcKB_nKsnk"
82
+ },
83
+ "source": [
84
+ "Download models (1 mins)"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "metadata": {
91
+ "id": "eDw3_UN8K2xa"
92
+ },
93
+ "outputs": [],
94
+ "source": [
95
+ "print('Download pre-trained models...')\n",
96
+ "!rm -rf checkpoints\n",
97
+ "!bash scripts/download_models.sh"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "metadata": {
104
+ "id": "kK7DYeo7Yl_H"
105
+ },
106
+ "outputs": [],
107
+ "source": [
108
+ "# borrow from makeittalk\n",
109
+ "import ipywidgets as widgets\n",
110
+ "import glob\n",
111
+ "import matplotlib.pyplot as plt\n",
112
+ "print(\"Choose the image name to animate: (saved in folder 'examples/')\")\n",
113
+ "img_list = glob.glob1('examples/source_image', '*.png')\n",
114
+ "img_list.sort()\n",
115
+ "img_list = [item.split('.')[0] for item in img_list]\n",
116
+ "default_head_name = widgets.Dropdown(options=img_list, value='full3')\n",
117
+ "def on_change(change):\n",
118
+ " if change['type'] == 'change' and change['name'] == 'value':\n",
119
+ " plt.imshow(plt.imread('examples/source_image/{}.png'.format(default_head_name.value)))\n",
120
+ " plt.axis('off')\n",
121
+ " plt.show()\n",
122
+ "default_head_name.observe(on_change)\n",
123
+ "display(default_head_name)\n",
124
+ "plt.imshow(plt.imread('examples/source_image/{}.png'.format(default_head_name.value)))\n",
125
+ "plt.axis('off')\n",
126
+ "plt.show()"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "markdown",
131
+ "metadata": {
132
+ "id": "-khNZcnGK4UK"
133
+ },
134
+ "source": [
135
+ "Animation"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "metadata": {
142
+ "id": "ToBlDusjK5sS"
143
+ },
144
+ "outputs": [],
145
+ "source": [
146
+ "# selected audio from exmaple/driven_audio\n",
147
+ "img = 'examples/source_image/{}.png'.format(default_head_name.value)\n",
148
+ "print(img)\n",
149
+ "!python3.8 inference.py --driven_audio ./examples/driven_audio/RD_Radio31_000.wav \\\n",
150
+ " --source_image {img} \\\n",
151
+ " --result_dir ./results --still --preprocess full --enhancer gfpgan"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": null,
157
+ "metadata": {
158
+ "id": "fAjwGmKKYl_I"
159
+ },
160
+ "outputs": [],
161
+ "source": [
162
+ "# visualize code from makeittalk\n",
163
+ "from IPython.display import HTML\n",
164
+ "from base64 import b64encode\n",
165
+ "import os, sys\n",
166
+ "\n",
167
+ "# get the last from results\n",
168
+ "\n",
169
+ "results = sorted(os.listdir('./results/'))\n",
170
+ "\n",
171
+ "mp4_name = glob.glob('./results/'+results[-1]+'/*.mp4')[0]\n",
172
+ "\n",
173
+ "mp4 = open('{}'.format(mp4_name),'rb').read()\n",
174
+ "data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
175
+ "\n",
176
+ "print('Display animation: {}'.format(mp4_name), file=sys.stderr)\n",
177
+ "display(HTML(\"\"\"\n",
178
+ " <video width=256 controls>\n",
179
+ " <source src=\"%s\" type=\"video/mp4\">\n",
180
+ " </video>\n",
181
+ " \"\"\" % data_url))\n"
182
+ ]
183
+ }
184
+ ],
185
+ "metadata": {
186
+ "colab": {
187
+ "provenance": []
188
+ },
189
+ "kernelspec": {
190
+ "display_name": "base",
191
+ "language": "python",
192
+ "name": "python3"
193
+ },
194
+ "language_info": {
195
+ "name": "python",
196
+ "version": "3.9.7"
197
+ },
198
+ "vscode": {
199
+ "interpreter": {
200
+ "hash": "db5031b3636a3f037ea48eb287fd3d023feb9033aefc2a9652a92e470fb0851b"
201
+ }
202
+ },
203
+ "accelerator": "GPU",
204
+ "gpuClass": "standard"
205
+ },
206
+ "nbformat": 4,
207
+ "nbformat_minor": 0
208
+ }
requirements3d.txt ADDED
@@ -0,0 +1,21 @@
1
+ numpy==1.23.4
2
+ face_alignment==1.3.5
3
+ imageio==2.19.3
4
+ imageio-ffmpeg==0.4.7
5
+ librosa==0.9.2
6
+ numba
7
+ resampy==0.3.1
8
+ pydub==0.25.1
9
+ scipy==1.5.3
10
+ kornia==0.6.8
11
+ tqdm
12
+ yacs==0.1.8
13
+ pyyaml
14
+ joblib==1.1.0
15
+ scikit-image==0.19.3
16
+ basicsr==1.4.2
17
+ facexlib==0.2.5
18
+ trimesh==3.9.20
19
+ dlib-bin
20
+ gradio
21
+ gfpgan
webui.bat ADDED
@@ -0,0 +1,17 @@
1
+ @echo off
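+ REM create a local venv if needed, activate it, and run the SadTalker launcher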
2
+
3
+ IF NOT EXIST venv (
4
+ python -m venv venv
5
+ ) ELSE (
6
+ echo venv folder already exists, skipping creation...
7
+ )
8
+ call .\venv\Scripts\activate.bat
9
+
10
+ set PYTHON="venv\Scripts\Python.exe"
11
+ echo venv %PYTHON%
12
+
13
+ %PYTHON% launcher.py
14
+
15
+ echo.
16
+ echo Launch unsuccessful. Exiting.
17
+ pause